MinIO for ML Model and Dataset Storage

MinIO is a high-performance, S3-compatible object storage server that can be self-hosted on your VPS, making it ideal for storing machine learning models, training datasets, and experiment artifacts without paying for cloud storage. This guide covers deploying MinIO, organizing buckets for ML workflows, enabling versioning, and integrating with Python ML tools.

Prerequisites

  • Ubuntu 20.04+ or CentOS/Rocky Linux 8+
  • Sufficient disk space (datasets can be hundreds of GB)
  • Python 3.8+ for client integration
  • 2GB+ RAM for the MinIO server

Installing MinIO Server

# Download the latest MinIO binary
wget https://dl.min.io/server/minio/release/linux-amd64/minio \
  -O /tmp/minio
sudo install /tmp/minio /usr/local/bin/minio

# Verify installation
minio --version

# Create data directory and MinIO user
sudo useradd -r -s /sbin/nologin minio-user
sudo mkdir -p /data/minio
sudo chown -R minio-user:minio-user /data/minio

# Create config directory
sudo mkdir -p /etc/minio

Running MinIO as a Service

# Create the environment file
sudo tee /etc/minio/minio.env << 'EOF'
# Root credentials (change these!)
MINIO_ROOT_USER=admin
MINIO_ROOT_PASSWORD=change_this_strong_password

# Data directory
MINIO_VOLUMES="/data/minio"

# Console port (web UI)
MINIO_CONSOLE_ADDRESS=":9001"

# Public URLs — set these only if MinIO sits behind a TLS-terminating
# reverse proxy; otherwise omit them and access MinIO directly over HTTP
MINIO_SERVER_URL=https://minio.example.com
MINIO_BROWSER_REDIRECT_URL=https://minio-console.example.com
EOF

sudo chmod 600 /etc/minio/minio.env

# Create systemd service
sudo tee /etc/systemd/system/minio.service << 'EOF'
[Unit]
Description=MinIO Object Storage
After=network-online.target
Wants=network-online.target

[Service]
WorkingDirectory=/usr/local/
User=minio-user
Group=minio-user
EnvironmentFile=/etc/minio/minio.env
ExecStart=/usr/local/bin/minio server ${MINIO_VOLUMES} \
    --console-address ${MINIO_CONSOLE_ADDRESS}
Restart=on-failure
LimitNOFILE=65536
TasksMax=infinity
TimeoutStopSec=infinity

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable minio
sudo systemctl start minio

# Check status
sudo systemctl status minio
sudo journalctl -u minio -f

# API is accessible at http://your-server:9000
# Console at http://your-server:9001

Install the MinIO CLI client:

wget https://dl.min.io/client/mc/release/linux-amd64/mc \
  -O /tmp/mc
sudo install /tmp/mc /usr/local/bin/mc

# Configure alias
mc alias set local http://localhost:9000 admin change_this_strong_password

# Test connection
mc ls local/

Bucket Organization for ML Workflows

Structure your buckets to reflect ML workflow stages:

# Create buckets for different asset types
mc mb local/ml-datasets
mc mb local/ml-models
mc mb local/ml-experiments
mc mb local/ml-checkpoints

# Create structured prefixes within buckets
# Datasets bucket structure:
# ml-datasets/
#   raw/project-name/v1/
#   processed/project-name/v1/
#   splits/project-name/v1/train/
#   splits/project-name/v1/val/
#   splits/project-name/v1/test/

# Models bucket structure:
# ml-models/
#   staging/model-name/v1/
#   production/model-name/v2/
#   archived/model-name/v1/

# Upload a dataset
mc cp --recursive /data/datasets/imagenet/ \
  local/ml-datasets/raw/imagenet/v1/

# Upload a trained model
mc cp model.pkl local/ml-models/staging/classifier/v1/

# List contents
mc ls local/ml-models/production/
mc du local/ml-datasets/  # Disk usage

Enabling Versioning

Object versioning protects against accidental deletion and lets you track model and dataset history:

# Enable versioning on model and experiment buckets
mc version enable local/ml-models
mc version enable local/ml-experiments

# Check versioning status
mc version info local/ml-models

# List versions of a specific object
mc ls --versions local/ml-models/production/classifier/model.pkl

# Restore a specific version
mc cp --version-id VERSION_ID \
  local/ml-models/production/classifier/model.pkl \
  ./restored-model.pkl

# Set lifecycle to clean up old non-current versions
mc ilm rule add local/ml-models \
  --noncurrent-expire-days 90

Python SDK Integration

pip install minio boto3

Using the MinIO Python SDK:

from minio import Minio
from minio.error import S3Error
import os

# Connect to MinIO
client = Minio(
    "localhost:9000",
    access_key="admin",
    secret_key="change_this_strong_password",
    secure=False  # True if using HTTPS
)

# Create a bucket
if not client.bucket_exists("ml-models"):
    client.make_bucket("ml-models")

# Upload a model file
client.fput_object(
    "ml-models",
    "production/classifier/v2/model.pkl",
    "/local/path/model.pkl",
    metadata={"accuracy": "0.95", "framework": "sklearn", "version": "2.0"}
)

# Download a model
client.fget_object(
    "ml-models",
    "production/classifier/v2/model.pkl",
    "/tmp/downloaded-model.pkl"
)

# List objects with prefix
objects = client.list_objects("ml-models", prefix="production/", recursive=True)
for obj in objects:
    print(f"{obj.object_name} — {obj.size} bytes — {obj.last_modified}")

# Upload a large dataset with multipart
client.fput_object(
    "ml-datasets",
    "raw/imagenet/train.tar.gz",
    "/data/imagenet/train.tar.gz",
    part_size=50*1024*1024  # 50MB chunks
)

Using boto3 (S3-compatible):

import boto3
from botocore.client import Config

s3 = boto3.client(
    's3',
    endpoint_url='http://localhost:9000',
    aws_access_key_id='admin',
    aws_secret_access_key='change_this_strong_password',
    config=Config(signature_version='s3v4'),
    region_name='us-east-1'
)

# Upload
s3.upload_file('/local/model.pkl', 'ml-models', 'production/v2/model.pkl')

# Download
s3.download_file('ml-models', 'production/v2/model.pkl', '/tmp/model.pkl')

# Presigned URL for temporary access
url = s3.generate_presigned_url(
    'get_object',
    Params={'Bucket': 'ml-models', 'Key': 'production/v2/model.pkl'},
    ExpiresIn=3600  # 1 hour
)
print(url)

Integration with MLflow and DVC

MLflow with MinIO Artifact Store

import mlflow

mlflow.set_tracking_uri("http://localhost:5000")

# Set S3 endpoint for MinIO
import os
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "change_this_strong_password"

# Start MLflow server with MinIO artifacts:
# mlflow server \
#   --backend-store-uri sqlite:///mlflow.db \
#   --default-artifact-root s3://ml-experiments/mlflow/

DVC with MinIO Remote

pip install dvc dvc-s3

# Initialize DVC in your ML project
cd /path/to/ml-project
dvc init

# Add MinIO as a DVC remote
dvc remote add -d minio s3://ml-datasets
dvc remote modify minio endpointurl http://localhost:9000
# Use --local so credentials go to the gitignored .dvc/config.local,
# not the committed .dvc/config
dvc remote modify --local minio access_key_id admin
dvc remote modify --local minio secret_access_key change_this_strong_password

# Track and push datasets
dvc add data/train/
git add data/train.dvc .gitignore
git commit -m "Add training dataset"

dvc push  # Uploads to MinIO
dvc pull  # Downloads from MinIO

High-Performance Data Pipelines

import torch
from torch.utils.data import Dataset, DataLoader
from minio import Minio
import io

class MinIODataset(Dataset):
    """PyTorch Dataset that reads samples directly from MinIO.

    Every object under ``prefix`` in ``bucket`` is expected to be a file
    written with ``torch.save``; ``__getitem__`` streams and deserializes
    it on demand, so the full dataset never has to fit on local disk.
    """

    def __init__(self, bucket, prefix, endpoint="localhost:9000"):
        """
        Args:
            bucket: Name of the MinIO bucket holding the samples.
            prefix: Key prefix selecting which objects belong to this dataset.
            endpoint: ``host:port`` of the MinIO API (defaults to a local server).
        """
        self.client = Minio(
            endpoint,
            access_key="admin",
            secret_key="change_this_strong_password",
            secure=False
        )
        self.bucket = bucket
        # Materialize the object list up front so __len__ is O(1) and
        # index -> object mapping stays stable across epochs/workers.
        self.objects = [
            obj.object_name
            for obj in self.client.list_objects(bucket, prefix=prefix, recursive=True)
        ]

    def __len__(self):
        return len(self.objects)

    def __getitem__(self, idx):
        obj_name = self.objects[idx]
        response = self.client.get_object(self.bucket, obj_name)
        try:
            # NOTE: torch.load deserializes pickled data — only point this
            # dataset at buckets whose contents you trust.
            data = torch.load(io.BytesIO(response.read()))
        finally:
            # Always close AND release the HTTP connection back to the
            # urllib3 pool, even when deserialization fails; otherwise each
            # failed read leaks a pooled connection.
            response.close()
            response.release_conn()
        return data

# Usage
dataset = MinIODataset("ml-datasets", "processed/imagenet/train/")
loader = DataLoader(dataset, batch_size=32, num_workers=4)

Troubleshooting

MinIO fails to start — permission denied

# Fix ownership
sudo chown -R minio-user:minio-user /data/minio

# Check disk space
df -h /data/minio

"Access Denied" from Python client

# Verify credentials
mc alias set local http://localhost:9000 admin your-password
mc ls local/

# Create an access key for applications (avoid using root credentials)
mc admin user add local app-user app-password
mc admin policy attach local readwrite --user app-user

Slow uploads from Python

# Use multipart upload for files over 100MB
client.fput_object(
    "bucket", "key", "/path/to/file",
    part_size=100*1024*1024  # 100MB parts
)

Connection refused — MinIO not accessible externally

sudo systemctl status minio
# Check which address MinIO is binding to
ss -tlnp | grep 9000

# Ensure firewall allows access
sudo ufw allow 9000/tcp
sudo ufw allow 9001/tcp

Conclusion

MinIO provides a self-hosted, high-performance S3-compatible storage solution that integrates seamlessly with Python ML tooling, MLflow experiment tracking, and DVC dataset versioning. Its bucket versioning capabilities ensure model lineage tracking, while the Python SDK makes it straightforward to build automated pipelines for uploading training artifacts and retrieving production models.