MLflow Installation for ML Lifecycle Management

MLflow is an open-source platform for managing the machine learning lifecycle, including experiment tracking, model registry, and deployment, and can be self-hosted on a Linux VPS to give teams centralized visibility into training runs and model versions. This guide covers deploying the MLflow tracking server, configuring backend and artifact storage, using the Python client, and setting up multi-user access.

Prerequisites

  • Ubuntu 20.04+ or CentOS/Rocky Linux 8+
  • Python 3.8+
  • MySQL or PostgreSQL for production backend (SQLite works for development)
  • Nginx for reverse proxy (optional but recommended)
  • S3-compatible storage or local filesystem for artifacts

Installing MLflow

# Create a dedicated, non-privileged service account for the tracking server
sudo useradd -m -s /bin/bash mlflow
sudo -u mlflow -i

# Create an isolated virtual environment so MLflow's dependencies
# don't leak into the system Python
python3 -m venv ~/mlflow-env
source ~/mlflow-env/bin/activate

# Install MLflow with extras.
# NOTE: quote the extras specifier — unquoted [] is a shell glob and
# breaks in zsh (and in bash whenever a file matching the pattern exists).
pip install --upgrade pip
pip install 'mlflow[extras]'

# For MySQL backend support
pip install mysqlclient

# For PostgreSQL backend support
pip install psycopg2-binary

# Verify installation
mlflow --version

Running the Tracking Server

Development Mode (SQLite + local filesystem)

# Quick start: SQLite file as the backend store, artifacts on local disk.
mlflow server \
  --backend-store-uri "sqlite:///mlflow.db" \
  --default-artifact-root "/home/mlflow/artifacts" \
  --host 0.0.0.0 \
  --port 5000

# The UI is now reachable at http://your-server:5000

Production Mode (MySQL + S3)

# Provision the MySQL backend store before starting the server.
# utf8mb4 avoids truncation issues with multi-byte run/param values.
# NOTE(review): the password appears on the command line and in shell
# history — prefer reading it from a prompt or a root-only file.
sudo mysql -e "
  CREATE DATABASE mlflow CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
  CREATE USER 'mlflow'@'localhost' IDENTIFIED BY 'strong_password';
  GRANT ALL PRIVILEGES ON mlflow.* TO 'mlflow'@'localhost';
  FLUSH PRIVILEGES;
"

# Launch the tracking server: MySQL as backend store, S3 for artifacts.
# --serve-artifacts proxies artifact transfers through the server, so
# clients do not need their own S3 credentials. Bound to loopback —
# expose it through the Nginx reverse proxy instead.
mlflow server \
  --backend-store-uri "mysql+mysqldb://mlflow:strong_password@localhost/mlflow" \
  --default-artifact-root "s3://my-mlflow-artifacts/mlflow" \
  --serve-artifacts \
  --host 127.0.0.1 \
  --port 5000

Configuring Backend and Artifact Storage

Environment Variables for Clean Configuration

# Write the server configuration to an env file (quoted 'EOF' keeps the
# contents literal — nothing is expanded by this shell).
cat > /home/mlflow/.mlflow-env << 'EOF'
# Backend store
MLFLOW_BACKEND_STORE_URI=mysql+mysqldb://mlflow:strong_password@localhost/mlflow

# Artifact store
MLFLOW_DEFAULT_ARTIFACT_ROOT=s3://my-mlflow-artifacts/mlflow

# AWS credentials (if using S3 for artifacts)
AWS_ACCESS_KEY_ID=YOUR_KEY
AWS_SECRET_ACCESS_KEY=YOUR_SECRET
AWS_DEFAULT_REGION=us-east-1
EOF

# The file holds database and cloud credentials — restrict it to the
# owner so other local users cannot read the secrets.
chmod 600 /home/mlflow/.mlflow-env

# For S3-compatible storage (MinIO, Backblaze, Spaces):
cat >> /home/mlflow/.mlflow-env << 'EOF'
MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
EOF

Using MinIO for Artifacts

# MLflow's S3 artifact store talks to MinIO via boto3 — the "minio"
# pip package is not used by MLflow. boto3 is already included in
# mlflow[extras]; install it explicitly otherwise:
pip install boto3

# Point boto3/botocore at the MinIO endpoint
export MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
export AWS_ACCESS_KEY_ID=minio_access_key
export AWS_SECRET_ACCESS_KEY=minio_secret_key

# Start server pointing to MinIO
mlflow server \
  --host 127.0.0.1 \
  --port 5000 \
  --backend-store-uri "mysql+mysqldb://mlflow:password@localhost/mlflow" \
  --default-artifact-root "s3://mlflow-bucket/artifacts" \
  --serve-artifacts

Running as a Systemd Service

# Install the tracking server as a systemd unit.
# The heredoc delimiter is quoted ('EOF'), so ${MLFLOW_*} is written
# literally into the unit file; systemd expands ${VAR} at start time
# from EnvironmentFile (braced ${VAR} stays one word — unbraced $VAR
# would word-split on whitespace).
sudo tee /etc/systemd/system/mlflow.service << 'EOF'
[Unit]
Description=MLflow Tracking Server
After=network.target mysql.service

[Service]
Type=simple
User=mlflow
WorkingDirectory=/home/mlflow
EnvironmentFile=/home/mlflow/.mlflow-env
ExecStart=/home/mlflow/mlflow-env/bin/mlflow server \
    --host 127.0.0.1 \
    --port 5000 \
    --backend-store-uri ${MLFLOW_BACKEND_STORE_URI} \
    --default-artifact-root ${MLFLOW_DEFAULT_ARTIFACT_ROOT} \
    --serve-artifacts
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

# Register the new unit, enable it at boot, start it now, and confirm.
sudo systemctl daemon-reload
sudo systemctl enable mlflow
sudo systemctl start mlflow
sudo systemctl status mlflow

Nginx Reverse Proxy

# Reverse-proxy MLflow behind Nginx with TLS and HTTP basic auth.
# NOTE(review): sites-available/sites-enabled is a Debian/Ubuntu layout;
# on CentOS/Rocky (also listed in Prerequisites) place the file in
# /etc/nginx/conf.d/ instead and skip the symlink.
sudo tee /etc/nginx/sites-available/mlflow << 'EOF'
server {
    listen 80;
    server_name mlflow.example.com;
    return 301 https://$host$request_uri;
}

server {
    listen 443 ssl;
    server_name mlflow.example.com;

    ssl_certificate /etc/letsencrypt/live/mlflow.example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/mlflow.example.com/privkey.pem;

    # Basic authentication (optional)
    auth_basic "MLflow";
    auth_basic_user_file /etc/nginx/.mlflow-htpasswd;

    location / {
        proxy_pass http://127.0.0.1:5000;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        client_max_body_size 500M;  # Allow large model uploads
    }
}
EOF

sudo ln -s /etc/nginx/sites-available/mlflow /etc/nginx/sites-enabled/

# Create basic auth credentials.
# NOTE: -c creates (and OVERWRITES) the htpasswd file — drop -c when
# adding further users later. apt-get is Debian/Ubuntu; use dnf on Rocky.
sudo apt-get install -y apache2-utils
sudo htpasswd -c /etc/nginx/.mlflow-htpasswd mlflow-user

# Validate the config before reloading so a typo can't take Nginx down
sudo nginx -t && sudo systemctl reload nginx

Using MLflow in Python Experiments

"""Train a RandomForest on Iris and track params/metrics/model in MLflow."""
import os

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Connect to your MLflow server
mlflow.set_tracking_uri("https://mlflow.example.com")

# Create the experiment if it doesn't exist, then make it active
mlflow.set_experiment("iris-classification")

# Load data; fix the split seed so metrics are comparable across re-runs
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Run an experiment
with mlflow.start_run(run_name="random-forest-v1"):
    # Log hyperparameters
    n_estimators = 100
    max_depth = 5
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)

    # Train model (seeded so logged accuracy is reproducible)
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42
    )
    model.fit(X_train, y_train)

    # Log metrics
    accuracy = accuracy_score(y_test, model.predict(X_test))
    mlflow.log_metric("accuracy", accuracy)

    # Log the trained model under the "model" artifact path
    mlflow.sklearn.log_model(model, "model")

    # Log supporting artifacts only if present — log_artifact raises on
    # a missing path and would abort the whole run otherwise
    if os.path.exists("config.yaml"):
        mlflow.log_artifact("config.yaml")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Run ID: {mlflow.active_run().info.run_id}")

Autologging (Minimal Code)

import mlflow

# Enable automatic logging for every framework MLflow can detect —
# params, metrics, and the model are captured without explicit log_* calls
mlflow.autolog()

# Or enable autologging per framework:

# TensorFlow/Keras — autolog captures all metrics, params, model
import tensorflow as tf
mlflow.tensorflow.autolog()

# PyTorch Lightning
mlflow.pytorch.autolog()

# Scikit-learn
mlflow.sklearn.autolog()

Model Registry

import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri="https://mlflow.example.com")

# Register a model from a finished run.
# Fill in the run ID from the tracking UI or mlflow.active_run().
# (The original f"runs:/RUN_ID/model" had no placeholder, so it would
# register the literal string "RUN_ID" as the run id.)
run_id = "RUN_ID"
model_uri = f"runs:/{run_id}/model"
registered_model = mlflow.register_model(model_uri, "IrisClassifier")

# Promote version 1 to Production, archiving any existing Production version.
# NOTE: registry stages are deprecated since MLflow 2.9 in favor of
# model-version aliases (client.set_registered_model_alias); this still
# works but plan to migrate.
client.transition_model_version_stage(
    name="IrisClassifier",
    version=1,
    stage="Production",
    archive_existing_versions=True
)

# Load the current Production model via its registry URI and score with it
model = mlflow.sklearn.load_model("models:/IrisClassifier/Production")
predictions = model.predict(X_test)

Troubleshooting

"Connection refused" on tracking server

# Verify the service is running and inspect its recent logs
sudo systemctl status mlflow
sudo journalctl -u mlflow --since "5 minutes ago"

# Check if port 5000 is listening — anchor the pattern to the port
# field; a bare "5000" would also match PIDs or addresses containing it
ss -tlnp | grep ':5000'

MySQL connection errors

# Test the database connection with the same credentials the server uses
mysql -u mlflow -p -e "USE mlflow; SHOW TABLES;"

# Verify the backend store URI is correct
# (only set in shells that sourced the env file — the systemd unit
# reads it from EnvironmentFile, not from your login environment)
echo $MLFLOW_BACKEND_STORE_URI

Artifact upload fails

# Check S3/MinIO credentials are present in the environment.
# WARNING: this prints secret values to the terminal — avoid in shared
# or logged sessions.
env | grep AWS
env | grep MLFLOW_S3

# Test S3 access using the same endpoint MLflow is configured with
aws s3 ls s3://my-mlflow-artifacts/ --endpoint-url http://localhost:9000

Large model uploads time out via Nginx

# Increase Nginx timeouts and body size.
# Add these inside the mlflow server/location block in the nginx config:
# client_max_body_size 1G;
# proxy_read_timeout 300;
# proxy_send_timeout 300;
# Then reload — the reload only takes effect after the config file has
# actually been edited
sudo systemctl reload nginx

Conclusion

Self-hosted MLflow gives your team a centralized platform for tracking experiments, comparing model performance, and managing the full model lifecycle without sending data to third-party services. Backing the tracking server with MySQL and S3-compatible artifact storage ensures durability and scalability as your experiment volume grows.