feat(ml): export clip models to ONNX and host models on Hugging Face (#4700)

* export clip models * export to hf refactored export code * export mclip, general refactoring cleanup * updated conda deps * do transforms with pillow and numpy, add tokenization config to export, general refactoring * moved conda dockerfile, re-added poetry * minor fixes * updated link * updated tests * removed `requirements.txt` from workflow * fixed mimalloc path * removed torchvision * cleaner np typing * review suggestions * update default model name * update test
2025-08-09 23:17:29 +02:00 · 2023-10-31 06:02:04 -04:00
parent 3212a47720
commit 87a0ba3db3
29 changed files with 6192 additions and 2043 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -166,7 +166,6 @@ jobs:
      - name: Install dependencies
        run: |
          poetry install --with dev
-          poetry run pip install --no-deps -r requirements.txt
      - name: Lint with ruff
        run: |
          poetry run ruff check --format=github app
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -10,9 +10,8 @@ RUN poetry config installer.max-workers 10 && \
 RUN python -m venv /opt/venv
 ENV VIRTUAL_ENV="/opt/venv" PATH="/opt/venv/bin:${PATH}"

-COPY poetry.lock pyproject.toml requirements.txt ./
+COPY poetry.lock pyproject.toml ./
 RUN poetry install --sync --no-interaction --no-ansi --no-root --only main
-RUN pip install --no-deps -r requirements.txt

 FROM python:3.11-slim-bookworm

--- a/machine-learning/app/conftest.py
+++ b/machine-learning/app/conftest.py
@@ -1,5 +1,6 @@
 import json
-from typing import Any, Iterator, TypeAlias
+from pathlib import Path
+from typing import Any, Iterator
 from unittest import mock

 import numpy as np
@@ -8,8 +9,7 @@ from fastapi.testclient import TestClient
 from PIL import Image

 from .main import app, init_state
-
-ndarray: TypeAlias = np.ndarray[int, np.dtype[np.float32]]
+from .schemas import ndarray_f32


@pytest.fixture
@@ -18,13 +18,13 @@ def pil_image() -> Image.Image:


@pytest.fixture
-def cv_image(pil_image: Image.Image) -> ndarray:
+def cv_image(pil_image: Image.Image) -> ndarray_f32:
    return np.asarray(pil_image)[:, :, ::-1]  # PIL uses RGB while cv2 uses BGR


@pytest.fixture
 def mock_get_model() -> Iterator[mock.Mock]:
-    with mock.patch("app.models.cache.InferenceModel.from_model_type", autospec=True) as mocked:
+    with mock.patch("app.models.cache.from_model_type", autospec=True) as mocked:
        yield mocked


@@ -37,3 +37,25 @@ def deployed_app() -> TestClient:
@pytest.fixture(scope="session")
 def responses() -> dict[str, Any]:
    return json.load(open("responses.json", "r"))
+
+
+@pytest.fixture(scope="session")
+def clip_model_cfg() -> dict[str, Any]:
+    return {
+        "embed_dim": 512,
+        "vision_cfg": {"image_size": 224, "layers": 12, "width": 768, "patch_size": 32},
+        "text_cfg": {"context_length": 77, "vocab_size": 49408, "width": 512, "heads": 8, "layers": 12},
+    }
+
+
+@pytest.fixture(scope="session")
+def clip_preprocess_cfg() -> dict[str, Any]:
+    return {
+        "size": [224, 224],
+        "mode": "RGB",
+        "mean": [0.48145466, 0.4578275, 0.40821073],
+        "std": [0.26862954, 0.26130258, 0.27577711],
+        "interpolation": "bicubic",
+        "resize_mode": "shortest",
+        "fill_color": 0,
+    }
--- a/machine-learning/app/models/init.py
+++ b/machine-learning/app/models/init.py
@@ -1,3 +1,25 @@
-from .clip import CLIPEncoder
+from typing import Any
+
+from app.schemas import ModelType
+
+from .base import InferenceModel
+from .clip import MCLIPEncoder, OpenCLIPEncoder, is_mclip, is_openclip
 from .facial_recognition import FaceRecognizer
 from .image_classification import ImageClassifier
+
+
+def from_model_type(model_type: ModelType, model_name: str, **model_kwargs: Any) -> InferenceModel:
+    match model_type:
+        case ModelType.CLIP:
+            if is_openclip(model_name):
+                return OpenCLIPEncoder(model_name, **model_kwargs)
+            elif is_mclip(model_name):
+                return MCLIPEncoder(model_name, **model_kwargs)
+            else:
+                raise ValueError(f"Unknown CLIP model {model_name}")
+        case ModelType.FACIAL_RECOGNITION:
+            return FaceRecognizer(model_name, **model_kwargs)
+        case ModelType.IMAGE_CLASSIFICATION:
+            return ImageClassifier(model_name, **model_kwargs)
+        case _:
+            raise ValueError(f"Unknown model type {model_type}")
--- a/machine-learning/app/models/base.py
+++ b/machine-learning/app/models/base.py
@@ -25,7 +25,7 @@ class InferenceModel(ABC):
    ) -> None:
        self.model_name = model_name
        self.loaded = False
-        self._cache_dir = Path(cache_dir) if cache_dir is not None else get_cache_dir(model_name, self.model_type)
+        self._cache_dir = Path(cache_dir) if cache_dir is not None else None
        self.providers = model_kwargs.pop("providers", ["CPUExecutionProvider"])
        #  don't pre-allocate more memory than needed
        self.provider_options = model_kwargs.pop(
@@ -92,7 +92,7 @@ class InferenceModel(ABC):

    @property
    def cache_dir(self) -> Path:
-        return self._cache_dir
+        return self._cache_dir if self._cache_dir is not None else get_cache_dir(self.model_name, self.model_type)

    @cache_dir.setter
    def cache_dir(self, cache_dir: Path) -> None:
--- a/machine-learning/app/models/cache.py
+++ b/machine-learning/app/models/cache.py
@@ -4,6 +4,8 @@ from aiocache.backends.memory import SimpleMemoryCache
 from aiocache.lock import OptimisticLock
 from aiocache.plugins import BasePlugin, TimingPlugin

+from app.models import from_model_type
+
 from ..schemas import ModelType
 from .base import InferenceModel

@@ -50,7 +52,7 @@ class ModelCache:
        async with OptimisticLock(self.cache, key) as lock:
            model = await self.cache.get(key)
            if model is None:
-                model = InferenceModel.from_model_type(model_type, model_name, **model_kwargs)
+                model = from_model_type(model_type, model_name, **model_kwargs)
                await lock.cas(model, ttl=self.ttl)
        return model

--- a/machine-learning/app/models/clip.py
+++ b/machine-learning/app/models/clip.py
@@ -1,23 +1,24 @@
-import os
-import zipfile
+import json
+from abc import abstractmethod
+from functools import cached_property
 from io import BytesIO
+from pathlib import Path
 from typing import Any, Literal

+import numpy as np
 import onnxruntime as ort
-import torch
-from clip_server.model.clip import BICUBIC, _convert_image_to_rgb
-from clip_server.model.clip_onnx import _MODELS, _S3_BUCKET_V2, CLIPOnnxModel, download_model
-from clip_server.model.pretrained_models import _VISUAL_MODEL_IMAGE_SIZE
-from clip_server.model.tokenization import Tokenizer
+from huggingface_hub import snapshot_download
 from PIL import Image
-from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
+from transformers import AutoTokenizer
+
+from app.config import log
+from app.models.transforms import crop, get_pil_resampling, normalize, resize, to_numpy
+from app.schemas import ModelType, ndarray_f32, ndarray_i32, ndarray_i64

-from ..config import log
-from ..schemas import ModelType
 from .base import InferenceModel


-class CLIPEncoder(InferenceModel):
+class BaseCLIPEncoder(InferenceModel):
    _model_type = ModelType.CLIP

    def __init__(
@@ -27,48 +28,29 @@ class CLIPEncoder(InferenceModel):
        mode: Literal["text", "vision"] | None = None,
        **model_kwargs: Any,
    ) -> None:
-        if mode is not None and mode not in ("text", "vision"):
-            raise ValueError(f"Mode must be 'text', 'vision', or omitted; got '{mode}'")
-        if model_name not in _MODELS:
-            raise ValueError(f"Unknown model name {model_name}.")
        self.mode = mode
        super().__init__(model_name, cache_dir, **model_kwargs)

-    def _download(self) -> None:
-        models: tuple[tuple[str, str], tuple[str, str]] = _MODELS[self.model_name]
-        text_onnx_path = self.cache_dir / "textual.onnx"
-        vision_onnx_path = self.cache_dir / "visual.onnx"
-
-        if not text_onnx_path.is_file():
-            self._download_model(*models[0])
-
-        if not vision_onnx_path.is_file():
-            self._download_model(*models[1])
-
    def _load(self) -> None:
        if self.mode == "text" or self.mode is None:
            log.debug(f"Loading clip text model '{self.model_name}'")
+
            self.text_model = ort.InferenceSession(
-                self.cache_dir / "textual.onnx",
+                self.textual_path.as_posix(),
                sess_options=self.sess_options,
                providers=self.providers,
                provider_options=self.provider_options,
            )
-            self.text_outputs = [output.name for output in self.text_model.get_outputs()]
-            self.tokenizer = Tokenizer(self.model_name)

        if self.mode == "vision" or self.mode is None:
            log.debug(f"Loading clip vision model '{self.model_name}'")
+
            self.vision_model = ort.InferenceSession(
-                self.cache_dir / "visual.onnx",
+                self.visual_path.as_posix(),
                sess_options=self.sess_options,
                providers=self.providers,
                provider_options=self.provider_options,
            )
-            self.vision_outputs = [output.name for output in self.vision_model.get_outputs()]
-
-            image_size = _VISUAL_MODEL_IMAGE_SIZE[CLIPOnnxModel.get_model_name(self.model_name)]
-            self.transform = _transform_pil_image(image_size)

    def _predict(self, image_or_text: Image.Image | str) -> list[float]:
        if isinstance(image_or_text, bytes):
@@ -78,55 +60,163 @@ class CLIPEncoder(InferenceModel):
            case Image.Image():
                if self.mode == "text":
                    raise TypeError("Cannot encode image as text-only model")
-                pixel_values = self.transform(image_or_text)
-                assert isinstance(pixel_values, torch.Tensor)
-                pixel_values = torch.unsqueeze(pixel_values, 0).numpy()
-                outputs = self.vision_model.run(self.vision_outputs, {"pixel_values": pixel_values})
+
+                outputs = self.vision_model.run(None, self.transform(image_or_text))
            case str():
                if self.mode == "vision":
                    raise TypeError("Cannot encode text as vision-only model")
-                text_inputs: dict[str, torch.Tensor] = self.tokenizer(image_or_text)
-                inputs = {
-                    "input_ids": text_inputs["input_ids"].int().numpy(),
-                    "attention_mask": text_inputs["attention_mask"].int().numpy(),
-                }
-                outputs = self.text_model.run(self.text_outputs, inputs)
+
+                outputs = self.text_model.run(None, self.tokenize(image_or_text))
            case _:
                raise TypeError(f"Expected Image or str, but got: {type(image_or_text)}")

        return outputs[0][0].tolist()

-    def _download_model(self, model_name: str, model_md5: str) -> bool:
-        # downloading logic is adapted from clip-server's CLIPOnnxModel class
-        download_model(
-            url=_S3_BUCKET_V2 + model_name,
-            target_folder=self.cache_dir.as_posix(),
-            md5sum=model_md5,
-            with_resume=True,
-        )
-        file = self.cache_dir / model_name.split("/")[1]
-        if file.suffix == ".zip":
-            with zipfile.ZipFile(file, "r") as zip_ref:
-                zip_ref.extractall(self.cache_dir)
-            os.remove(file)
-        return True
+    @abstractmethod
+    def tokenize(self, text: str) -> dict[str, ndarray_i32]:
+        pass
+
+    @abstractmethod
+    def transform(self, image: Image.Image) -> dict[str, ndarray_f32]:
+        pass
+
+    @property
+    def textual_dir(self) -> Path:
+        return self.cache_dir / "textual"
+
+    @property
+    def visual_dir(self) -> Path:
+        return self.cache_dir / "visual"
+
+    @property
+    def model_cfg_path(self) -> Path:
+        return self.cache_dir / "config.json"
+
+    @property
+    def textual_path(self) -> Path:
+        return self.textual_dir / "model.onnx"
+
+    @property
+    def visual_path(self) -> Path:
+        return self.visual_dir / "model.onnx"
+
+    @property
+    def preprocess_cfg_path(self) -> Path:
+        return self.visual_dir / "preprocess_cfg.json"

    @property
    def cached(self) -> bool:
-        return (self.cache_dir / "textual.onnx").is_file() and (self.cache_dir / "visual.onnx").is_file()
+        return self.textual_path.is_file() and self.visual_path.is_file()


-# same as `_transform_blob` without `_blob2image`
-def _transform_pil_image(n_px: int) -> Compose:
-    return Compose(
-        [
-            Resize(n_px, interpolation=BICUBIC),
-            CenterCrop(n_px),
-            _convert_image_to_rgb,
-            ToTensor(),
-            Normalize(
-                (0.48145466, 0.4578275, 0.40821073),
-                (0.26862954, 0.26130258, 0.27577711),
-            ),
-        ]
-    )
+class OpenCLIPEncoder(BaseCLIPEncoder):
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: str | None = None,
+        mode: Literal["text", "vision"] | None = None,
+        **model_kwargs: Any,
+    ) -> None:
+        super().__init__(_clean_model_name(model_name), cache_dir, mode, **model_kwargs)
+
+    def _download(self) -> None:
+        snapshot_download(
+            f"immich-app/{self.model_name}",
+            cache_dir=self.cache_dir,
+            local_dir=self.cache_dir,
+            local_dir_use_symlinks=False,
+        )
+
+    def _load(self) -> None:
+        super()._load()
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.textual_dir)
+        self.sequence_length = self.model_cfg["text_cfg"]["context_length"]
+
+        self.size = (
+            self.preprocess_cfg["size"][0] if type(self.preprocess_cfg["size"]) == list else self.preprocess_cfg["size"]
+        )
+        self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"])
+        self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32)
+        self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32)
+
+    def tokenize(self, text: str) -> dict[str, ndarray_i32]:
+        input_ids: ndarray_i64 = self.tokenizer(
+            text,
+            max_length=self.sequence_length,
+            return_tensors="np",
+            return_attention_mask=False,
+            padding="max_length",
+            truncation=True,
+        ).input_ids
+        return {"text": input_ids.astype(np.int32)}
+
+    def transform(self, image: Image.Image) -> dict[str, ndarray_f32]:
+        image = resize(image, self.size)
+        image = crop(image, self.size)
+        image_np = to_numpy(image)
+        image_np = normalize(image_np, self.mean, self.std)
+        return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)}
+
+    @cached_property
+    def model_cfg(self) -> dict[str, Any]:
+        return json.load(self.model_cfg_path.open())
+
+    @cached_property
+    def preprocess_cfg(self) -> dict[str, Any]:
+        return json.load(self.preprocess_cfg_path.open())
+
+
+class MCLIPEncoder(OpenCLIPEncoder):
+    def tokenize(self, text: str) -> dict[str, ndarray_i32]:
+        tokens: dict[str, ndarray_i64] = self.tokenizer(text, return_tensors="np")
+        return {k: v.astype(np.int32) for k, v in tokens.items()}
+
+
+_OPENCLIP_MODELS = {
+    "RN50__openai",
+    "RN50__yfcc15m",
+    "RN50__cc12m",
+    "RN101__openai",
+    "RN101__yfcc15m",
+    "RN50x4__openai",
+    "RN50x16__openai",
+    "RN50x64__openai",
+    "ViT-B-32__openai",
+    "ViT-B-32__laion2b_e16",
+    "ViT-B-32__laion400m_e31",
+    "ViT-B-32__laion400m_e32",
+    "ViT-B-32__laion2b-s34b-b79k",
+    "ViT-B-16__openai",
+    "ViT-B-16__laion400m_e31",
+    "ViT-B-16__laion400m_e32",
+    "ViT-B-16-plus-240__laion400m_e31",
+    "ViT-B-16-plus-240__laion400m_e32",
+    "ViT-L-14__openai",
+    "ViT-L-14__laion400m_e31",
+    "ViT-L-14__laion400m_e32",
+    "ViT-L-14__laion2b-s32b-b82k",
+    "ViT-L-14-336__openai",
+    "ViT-H-14__laion2b-s32b-b79k",
+    "ViT-g-14__laion2b-s12b-b42k",
+}
+
+
+_MCLIP_MODELS = {
+    "LABSE-Vit-L-14",
+    "XLM-Roberta-Large-Vit-B-32",
+    "XLM-Roberta-Large-Vit-B-16Plus",
+    "XLM-Roberta-Large-Vit-L-14",
+}
+
+
+def _clean_model_name(model_name: str) -> str:
+    return model_name.split("/")[-1].replace("::", "__")
+
+
+def is_openclip(model_name: str) -> bool:
+    return _clean_model_name(model_name) in _OPENCLIP_MODELS
+
+
+def is_mclip(model_name: str) -> bool:
+    return _clean_model_name(model_name) in _MCLIP_MODELS
--- a/machine-learning/app/models/facial_recognition.py
+++ b/machine-learning/app/models/facial_recognition.py
@@ -9,7 +9,8 @@ from insightface.model_zoo import ArcFaceONNX, RetinaFace
 from insightface.utils.face_align import norm_crop
 from insightface.utils.storage import BASE_REPO_URL, download_file

-from ..schemas import ModelType
+from app.schemas import ModelType, ndarray_f32
+
 from .base import InferenceModel


@@ -68,7 +69,7 @@ class FaceRecognizer(InferenceModel):
        )
        self.rec_model.prepare(ctx_id=0)

-    def _predict(self, image: np.ndarray[int, np.dtype[Any]] | bytes) -> list[dict[str, Any]]:
+    def _predict(self, image: ndarray_f32 | bytes) -> list[dict[str, Any]]:
        if isinstance(image, bytes):
            image = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
        bboxes, kpss = self.det_model.detect(image)
--- a/machine-learning/app/models/transforms.py
+++ b/machine-learning/app/models/transforms.py
@@ -0,0 +1,35 @@
+import numpy as np
+from PIL import Image
+
+from app.schemas import ndarray_f32
+
+_PIL_RESAMPLING_METHODS = {resampling.name.lower(): resampling for resampling in Image.Resampling}
+
+
+def resize(img: Image.Image, size: int) -> Image.Image:
+    if img.width < img.height:
+        return img.resize((size, int((img.height / img.width) * size)), resample=Image.BICUBIC)
+    else:
+        return img.resize((int((img.width / img.height) * size), size), resample=Image.BICUBIC)
+
+
+# https://stackoverflow.com/a/60883103
+def crop(img: Image.Image, size: int) -> Image.Image:
+    left = int((img.size[0] / 2) - (size / 2))
+    upper = int((img.size[1] / 2) - (size / 2))
+    right = left + size
+    lower = upper + size
+
+    return img.crop((left, upper, right, lower))
+
+
+def to_numpy(img: Image.Image) -> ndarray_f32:
+    return np.asarray(img.convert("RGB")).astype(np.float32) / 255.0
+
+
+def normalize(img: ndarray_f32, mean: float | ndarray_f32, std: float | ndarray_f32) -> ndarray_f32:
+    return (img - mean) / std
+
+
+def get_pil_resampling(resample: str) -> Image.Resampling:
+    return _PIL_RESAMPLING_METHODS[resample.lower()]
--- a/machine-learning/app/schemas.py
+++ b/machine-learning/app/schemas.py
@@ -1,5 +1,7 @@
 from enum import StrEnum
+from typing import TypeAlias

+import numpy as np
 from pydantic import BaseModel


@@ -31,3 +33,8 @@ class ModelType(StrEnum):
    IMAGE_CLASSIFICATION = "image-classification"
    CLIP = "clip"
    FACIAL_RECOGNITION = "facial-recognition"
+
+
+ndarray_f32: TypeAlias = np.ndarray[int, np.dtype[np.float32]]
+ndarray_i64: TypeAlias = np.ndarray[int, np.dtype[np.int64]]
+ndarray_i32: TypeAlias = np.ndarray[int, np.dtype[np.int32]]
--- a/machine-learning/app/test_main.py
+++ b/machine-learning/app/test_main.py
@@ -1,7 +1,8 @@
 import json
 import pickle
 from io import BytesIO
-from typing import Any, TypeAlias
+from pathlib import Path
+from typing import Any, Callable
 from unittest import mock

 import cv2
@@ -14,13 +15,11 @@ from pytest_mock import MockerFixture
 from .config import settings
 from .models.base import PicklableSessionOptions
 from .models.cache import ModelCache
-from .models.clip import CLIPEncoder
+from .models.clip import OpenCLIPEncoder
 from .models.facial_recognition import FaceRecognizer
 from .models.image_classification import ImageClassifier
 from .schemas import ModelType

-ndarray: TypeAlias = np.ndarray[int, np.dtype[np.float32]]
-

 class TestImageClassifier:
    classifier_preds = [
@@ -56,30 +55,50 @@ class TestImageClassifier:

 class TestCLIP:
    embedding = np.random.rand(512).astype(np.float32)
+    cache_dir = Path("test_cache")

-    def test_basic_image(self, pil_image: Image.Image, mocker: MockerFixture) -> None:
-        mocker.patch.object(CLIPEncoder, "download")
+    def test_basic_image(
+        self,
+        pil_image: Image.Image,
+        mocker: MockerFixture,
+        clip_model_cfg: dict[str, Any],
+        clip_preprocess_cfg: Callable[[Path], dict[str, Any]],
+    ) -> None:
+        mocker.patch.object(OpenCLIPEncoder, "download")
+        mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
+        mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
+        mocker.patch("app.models.clip.AutoTokenizer.from_pretrained", autospec=True)
        mocked = mocker.patch("app.models.clip.ort.InferenceSession", autospec=True)
        mocked.return_value.run.return_value = [[self.embedding]]
-        clip_encoder = CLIPEncoder("ViT-B-32::openai", cache_dir="test_cache", mode="vision")
-        assert clip_encoder.mode == "vision"
+
+        clip_encoder = OpenCLIPEncoder("ViT-B-32::openai", cache_dir="test_cache", mode="vision")
        embedding = clip_encoder.predict(pil_image)

+        assert clip_encoder.mode == "vision"
        assert isinstance(embedding, list)
-        assert len(embedding) == 512
+        assert len(embedding) == clip_model_cfg["embed_dim"]
        assert all([isinstance(num, float) for num in embedding])
        clip_encoder.vision_model.run.assert_called_once()

-    def test_basic_text(self, mocker: MockerFixture) -> None:
-        mocker.patch.object(CLIPEncoder, "download")
+    def test_basic_text(
+        self,
+        mocker: MockerFixture,
+        clip_model_cfg: dict[str, Any],
+        clip_preprocess_cfg: Callable[[Path], dict[str, Any]],
+    ) -> None:
+        mocker.patch.object(OpenCLIPEncoder, "download")
+        mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
+        mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
+        mocker.patch("app.models.clip.AutoTokenizer.from_pretrained", autospec=True)
        mocked = mocker.patch("app.models.clip.ort.InferenceSession", autospec=True)
        mocked.return_value.run.return_value = [[self.embedding]]
-        clip_encoder = CLIPEncoder("ViT-B-32::openai", cache_dir="test_cache", mode="text")
-        assert clip_encoder.mode == "text"
+
+        clip_encoder = OpenCLIPEncoder("ViT-B-32::openai", cache_dir="test_cache", mode="text")
        embedding = clip_encoder.predict("test search query")

+        assert clip_encoder.mode == "text"
        assert isinstance(embedding, list)
-        assert len(embedding) == 512
+        assert len(embedding) == clip_model_cfg["embed_dim"]
        assert all([isinstance(num, float) for num in embedding])
        clip_encoder.text_model.run.assert_called_once()

--- a/machine-learning/export/Dockerfile
+++ b/machine-learning/export/Dockerfile
@@ -0,0 +1,21 @@
+FROM mambaorg/micromamba:bookworm-slim as builder
+
+ENV NODE_ENV=production \
+  TRANSFORMERS_CACHE=/cache \
+  PYTHONDONTWRITEBYTECODE=1 \
+  PYTHONUNBUFFERED=1 \
+  PATH="/opt/venv/bin:$PATH" \
+  PYTHONPATH=/usr/src
+
+COPY --chown=$MAMBA_USER:$MAMBA_USER conda-lock.yml /tmp/conda-lock.yml
+RUN micromamba install -y -n base -f /tmp/conda-lock.yml && \
+    micromamba remove -y -n base cxx-compiler && \
+    micromamba clean --all --yes
+
+WORKDIR /usr/src/app
+
+COPY --chown=$MAMBA_USER:$MAMBA_USER start.sh .
+COPY --chown=$MAMBA_USER:$MAMBA_USER app .
+
+ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"]
+CMD ["./start.sh"]
--- a/machine-learning/export/conda-lock.yml
+++ b/machine-learning/export/conda-lock.yml
--- a/machine-learning/export/env.dev.yaml
+++ b/machine-learning/export/env.dev.yaml
@@ -0,0 +1,15 @@
+name: base
+channels:
+  - conda-forge
+platforms:
+  - linux-64
+  - linux-aarch64
+dependencies:
+  - black
+  - conda-lock
+  - mypy
+  - pytest
+  - pytest-cov
+  - pytest-mock
+  - ruff
+category: dev
--- a/machine-learning/export/env.yaml
+++ b/machine-learning/export/env.yaml
@@ -0,0 +1,25 @@
+name: base
+channels:
+  - conda-forge
+  - nvidia
+  - pytorch-nightly
+platforms:
+  - linux-64
+dependencies:
+  - cxx-compiler
+  - onnx==1.*
+  - onnxruntime==1.*
+  - open-clip-torch==2.*
+  - orjson==3.*
+  - pip
+  - python==3.11.*
+  - pytorch
+  - rich==13.*
+  - safetensors==0.*
+  - setuptools==68.*
+  - torchvision
+  - transformers==4.*
+  - pip:
+    - multilingual-clip
+    - onnx-simplifier
+category: main
--- a/machine-learning/export/models/init.py
+++ b/machine-learning/export/models/init.py
--- a/machine-learning/export/models/mclip.py
+++ b/machine-learning/export/models/mclip.py
@@ -0,0 +1,67 @@
+import tempfile
+import warnings
+from pathlib import Path
+
+import torch
+from multilingual_clip.pt_multilingual_clip import MultilingualCLIP
+from transformers import AutoTokenizer
+
+from .openclip import OpenCLIPModelConfig
+from .openclip import to_onnx as openclip_to_onnx
+from .optimize import optimize
+from .util import get_model_path
+
+_MCLIP_TO_OPENCLIP = {
+    "M-CLIP/XLM-Roberta-Large-Vit-B-32": OpenCLIPModelConfig("ViT-B-32", "openai"),
+    "M-CLIP/XLM-Roberta-Large-Vit-B-16Plus": OpenCLIPModelConfig("ViT-B-16-plus-240", "laion400m_e32"),
+    "M-CLIP/LABSE-Vit-L-14": OpenCLIPModelConfig("ViT-L-14", "openai"),
+    "M-CLIP/XLM-Roberta-Large-Vit-L-14": OpenCLIPModelConfig("ViT-L-14", "openai"),
+}
+
+
+def to_onnx(
+    model_name: str,
+    output_dir_visual: Path | str,
+    output_dir_textual: Path | str,
+) -> None:
+    textual_path = get_model_path(output_dir_textual)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model = MultilingualCLIP.from_pretrained(model_name, cache_dir=tmpdir)
+        AutoTokenizer.from_pretrained(model_name).save_pretrained(output_dir_textual)
+
+        for param in model.parameters():
+            param.requires_grad_(False)
+
+        export_text_encoder(model, textual_path)
+        openclip_to_onnx(_MCLIP_TO_OPENCLIP[model_name], output_dir_visual)
+        optimize(textual_path)
+
+
+def export_text_encoder(model: MultilingualCLIP, output_path: Path | str) -> None:
+    output_path = Path(output_path)
+
+    def forward(self: MultilingualCLIP, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        embs = self.transformer(input_ids, attention_mask)[0]
+        embs = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]
+        embs = self.LinearTransformation(embs)
+        return torch.nn.functional.normalize(embs, dim=-1)
+
+    # unfortunately need to monkeypatch for tracing to work here
+    # otherwise it hits the 2GiB protobuf serialization limit
+    MultilingualCLIP.forward = forward
+
+    args = (torch.ones(1, 77, dtype=torch.int32), torch.ones(1, 77, dtype=torch.int32))
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        torch.onnx.export(
+            model,
+            args,
+            output_path.as_posix(),
+            input_names=["input_ids", "attention_mask"],
+            output_names=["text_embedding"],
+            opset_version=17,
+            dynamic_axes={
+                "input_ids": {0: "batch_size", 1: "sequence_length"},
+                "attention_mask": {0: "batch_size", 1: "sequence_length"},
+            },
+        )
--- a/machine-learning/export/models/openclip.py
+++ b/machine-learning/export/models/openclip.py
@@ -0,0 +1,109 @@
+import tempfile
+import warnings
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import open_clip
+import torch
+from transformers import AutoTokenizer
+
+from .optimize import optimize
+from .util import get_model_path, save_config
+
+
+@dataclass
+class OpenCLIPModelConfig:
+    name: str
+    pretrained: str
+    image_size: int = field(init=False)
+    sequence_length: int = field(init=False)
+
+    def __post_init__(self) -> None:
+        open_clip_cfg = open_clip.get_model_config(self.name)
+        if open_clip_cfg is None:
+            raise ValueError(f"Unknown model {self.name}")
+        self.image_size = open_clip_cfg["vision_cfg"]["image_size"]
+        self.sequence_length = open_clip_cfg["text_cfg"]["context_length"]
+
+
+def to_onnx(
+    model_cfg: OpenCLIPModelConfig,
+    output_dir_visual: Path | str | None = None,
+    output_dir_textual: Path | str | None = None,
+) -> None:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model = open_clip.create_model(
+            model_cfg.name,
+            pretrained=model_cfg.pretrained,
+            jit=False,
+            cache_dir=tmpdir,
+            require_pretrained=True,
+        )
+
+        text_vision_cfg = open_clip.get_model_config(model_cfg.name)
+
+        for param in model.parameters():
+            param.requires_grad_(False)
+
+        if output_dir_visual is not None:
+            output_dir_visual = Path(output_dir_visual)
+            visual_path = get_model_path(output_dir_visual)
+
+            save_config(open_clip.get_model_preprocess_cfg(model), output_dir_visual / "preprocess_cfg.json")
+            save_config(text_vision_cfg, output_dir_visual.parent / "config.json")
+            export_image_encoder(model, model_cfg, visual_path)
+
+            optimize(visual_path)
+
+        if output_dir_textual is not None:
+            output_dir_textual = Path(output_dir_textual)
+            textual_path = get_model_path(output_dir_textual)
+
+            tokenizer_name = text_vision_cfg["text_cfg"].get("hf_tokenizer_name", "openai/clip-vit-base-patch32")
+            AutoTokenizer.from_pretrained(tokenizer_name).save_pretrained(output_dir_textual)
+            export_text_encoder(model, model_cfg, textual_path)
+            optimize(textual_path)
+
+
+def export_image_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None:
+    output_path = Path(output_path)
+
+    def encode_image(image: torch.Tensor) -> torch.Tensor:
+        return model.encode_image(image, normalize=True)
+
+    args = (torch.randn(1, 3, model_cfg.image_size, model_cfg.image_size),)
+    traced = torch.jit.trace(encode_image, args)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        torch.onnx.export(
+            traced,
+            args,
+            output_path.as_posix(),
+            input_names=["image"],
+            output_names=["image_embedding"],
+            opset_version=17,
+            dynamic_axes={"image": {0: "batch_size"}},
+        )
+
+
+def export_text_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None:
+    output_path = Path(output_path)
+
+    def encode_text(text: torch.Tensor) -> torch.Tensor:
+        return model.encode_text(text, normalize=True)
+
+    args = (torch.ones(1, model_cfg.sequence_length, dtype=torch.int32),)
+    traced = torch.jit.trace(encode_text, args)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        torch.onnx.export(
+            traced,
+            args,
+            output_path.as_posix(),
+            input_names=["text"],
+            output_names=["text_embedding"],
+            opset_version=17,
+            dynamic_axes={"text": {0: "batch_size"}},
+        )
--- a/machine-learning/export/models/optimize.py
+++ b/machine-learning/export/models/optimize.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+import onnx
+import onnxruntime as ort
+import onnxsim
+
+
+def optimize_onnxsim(model_path: Path | str, output_path: Path | str) -> None:
+    model_path = Path(model_path)
+    output_path = Path(output_path)
+    model = onnx.load(model_path.as_posix())
+    model, check = onnxsim.simplify(model, skip_shape_inference=True)
+    assert check, "Simplified ONNX model could not be validated"
+    onnx.save(model, output_path.as_posix())
+
+
+def optimize_ort(
+    model_path: Path | str,
+    output_path: Path | str,
+    level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
+) -> None:
+    model_path = Path(model_path)
+    output_path = Path(output_path)
+
+    sess_options = ort.SessionOptions()
+    sess_options.graph_optimization_level = level
+    sess_options.optimized_model_filepath = output_path.as_posix()
+
+    ort.InferenceSession(model_path.as_posix(), providers=["CPUExecutionProvider"], sess_options=sess_options)
+
+
+def optimize(model_path: Path | str) -> None:
+    model_path = Path(model_path)
+
+    optimize_ort(model_path, model_path)
+    # onnxsim serializes large models as a blob, which uses much more memory when loading the model at runtime
+    if not any(file.name.startswith("Constant") for file in model_path.parent.iterdir()):
+        optimize_onnxsim(model_path, model_path)
--- a/machine-learning/export/models/util.py
+++ b/machine-learning/export/models/util.py
@@ -0,0 +1,15 @@
+import json
+from pathlib import Path
+from typing import Any
+
+
+def get_model_path(output_dir: Path | str) -> Path:
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    return output_dir / "model.onnx"
+
+
+def save_config(config: Any, output_path: Path | str) -> None:
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    json.dump(config, output_path.open("w"))
--- a/machine-learning/export/run.py
+++ b/machine-learning/export/run.py
@@ -0,0 +1,76 @@
+import gc
+import os
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from huggingface_hub import create_repo, login, upload_folder
+from models import mclip, openclip
+from rich.progress import Progress
+
+models = [
+    "RN50::openai",
+    "RN50::yfcc15m",
+    "RN50::cc12m",
+    "RN101::openai",
+    "RN101::yfcc15m",
+    "RN50x4::openai",
+    "RN50x16::openai",
+    "RN50x64::openai",
+    "ViT-B-32::openai",
+    "ViT-B-32::laion2b_e16",
+    "ViT-B-32::laion400m_e31",
+    "ViT-B-32::laion400m_e32",
+    "ViT-B-32::laion2b-s34b-b79k",
+    "ViT-B-16::openai",
+    "ViT-B-16::laion400m_e31",
+    "ViT-B-16::laion400m_e32",
+    "ViT-B-16-plus-240::laion400m_e31",
+    "ViT-B-16-plus-240::laion400m_e32",
+    "ViT-L-14::openai",
+    "ViT-L-14::laion400m_e31",
+    "ViT-L-14::laion400m_e32",
+    "ViT-L-14::laion2b-s32b-b82k",
+    "ViT-L-14-336::openai",
+    "ViT-H-14::laion2b-s32b-b79k",
+    "ViT-g-14::laion2b-s12b-b42k",
+    "M-CLIP/LABSE-Vit-L-14",
+    "M-CLIP/XLM-Roberta-Large-Vit-B-32",
+    "M-CLIP/XLM-Roberta-Large-Vit-B-16Plus",
+    "M-CLIP/XLM-Roberta-Large-Vit-L-14",
+]
+
+login(token=os.environ["HF_AUTH_TOKEN"])
+
+with Progress() as progress:
+    task1 = progress.add_task("[green]Exporting models...", total=len(models))
+    task2 = progress.add_task("[yellow]Uploading models...", total=len(models))
+
+    with TemporaryDirectory() as tmp:
+        tmpdir = Path(tmp)
+        for model in models:
+            model_name = model.split("/")[-1].replace("::", "__")
+            config_path = tmpdir / model_name / "config.json"
+
+            def upload() -> None:
+                progress.update(task2, description=f"[yellow]Uploading {model_name}")
+                repo_id = f"immich-app/{model_name}"
+
+                create_repo(repo_id, exist_ok=True)
+                upload_folder(repo_id=repo_id, folder_path=tmpdir / model_name)
+                progress.update(task2, advance=1)
+
+            def export() -> None:
+                progress.update(task1, description=f"[green]Exporting {model_name}")
+                visual_dir = tmpdir / model_name / "visual"
+                textual_dir = tmpdir / model_name / "textual"
+                if model.startswith("M-CLIP"):
+                    mclip.to_onnx(model, visual_dir, textual_dir)
+                else:
+                    name, _, pretrained = model_name.partition("__")
+                    openclip.to_onnx(openclip.OpenCLIPModelConfig(name, pretrained), visual_dir, textual_dir)
+
+                progress.update(task1, advance=1)
+                gc.collect()
+
+            export()
+            upload()
--- a/machine-learning/locustfile.py
+++ b/machine-learning/locustfile.py
@@ -1,11 +1,12 @@
-from io import BytesIO
 import json
+from argparse import ArgumentParser
+from io import BytesIO
 from typing import Any

 from locust import HttpUser, events, task
 from locust.env import Environment
 from PIL import Image
-from argparse import ArgumentParser
+
 byte_image = BytesIO()


@@ -14,11 +15,21 @@ def _(parser: ArgumentParser) -> None:
    parser.add_argument("--tag-model", type=str, default="microsoft/resnet-50")
    parser.add_argument("--clip-model", type=str, default="ViT-B-32::openai")
    parser.add_argument("--face-model", type=str, default="buffalo_l")
-    parser.add_argument("--tag-min-score", type=int, default=0.0, 
-                        help="Returns all tags at or above this score. The default returns all tags.")
-    parser.add_argument("--face-min-score", type=int, default=0.034, 
-                        help=("Returns all faces at or above this score. The default returns 1 face per request; "
-                              "setting this to 0 blows up the number of faces to the thousands."))
+    parser.add_argument(
+        "--tag-min-score",
+        type=int,
+        default=0.0,
+        help="Returns all tags at or above this score. The default returns all tags.",
+    )
+    parser.add_argument(
+        "--face-min-score",
+        type=int,
+        default=0.034,
+        help=(
+            "Returns all faces at or above this score. The default returns 1 face per request; "
+            "setting this to 0 blows up the number of faces to the thousands."
+        ),
+    )
    parser.add_argument("--image-size", type=int, default=1000)


@@ -62,7 +73,7 @@ class CLIPTextFormDataLoadTest(InferenceLoadTest):
            ("modelName", self.environment.parsed_options.clip_model),
            ("modelType", "clip"),
            ("options", json.dumps({"mode": "text"})),
-            ("text", "test search query")
+            ("text", "test search query"),
        ]
        self.client.post("/predict", data=data)

@@ -88,5 +99,5 @@ class RecognitionFormDataLoadTest(InferenceLoadTest):
            ("options", json.dumps({"minScore": self.environment.parsed_options.face_min_score})),
        ]
        files = {"image": self.data}
-            
+
        self.client.post("/predict", data=data, files=files)
--- a/machine-learning/poetry.lock
+++ b/machine-learning/poetry.lock
--- a/machine-learning/pyproject.toml
+++ b/machine-learning/pyproject.toml
@@ -9,8 +9,8 @@ packages = [{include = "app"}]
 [tool.poetry.dependencies]
 python = "^3.11"
 torch = [
-    {markers = "platform_machine == 'arm64' or platform_machine == 'aarch64'", version = "=2.0.1", source = "pypi"},
-    {markers = "platform_machine == 'amd64' or platform_machine == 'x86_64'", version = "=2.0.1", source = "pytorch-cpu"}
+    {markers = "platform_machine == 'arm64' or platform_machine == 'aarch64'", version = "=2.1.0", source = "pypi"},
+    {markers = "platform_machine == 'amd64' or platform_machine == 'x86_64'", version = "=2.1.0", source = "pytorch-cpu"}
 ]
 transformers = "^4.29.2"
 onnxruntime = "^1.15.0"
@@ -22,14 +22,9 @@ uvicorn = {extras = ["standard"], version = "^0.22.0"}
 pydantic = "^1.10.8"
 aiocache = "^0.12.1"
 optimum = "^1.9.1"
-torchvision = [
-    {markers = "platform_machine == 'arm64' or platform_machine == 'aarch64'", version = "=0.15.2", source = "pypi"},
-    {markers = "platform_machine == 'amd64' or platform_machine == 'x86_64'", version = "=0.15.2", source = "pytorch-cpu"}
-]
 rich = "^13.4.2"
 ftfy = "^6.1.1"
 setuptools = "^68.0.0"
-open-clip-torch = "^2.20.0"
 python-multipart = "^0.0.6"
 orjson = "^3.9.5"
 safetensors = "0.3.2"
@@ -63,6 +58,7 @@ warn_redundant_casts = true
 disallow_any_generics = true
 check_untyped_defs = true
 disallow_untyped_defs = true
+ignore_missing_imports = true

 [tool.pydantic-mypy]
 init_forbid_extra = true
@@ -70,30 +66,6 @@ init_typed = true
 warn_required_dynamic_aliases = true
 warn_untyped_fields = true

-[[tool.mypy.overrides]]
-module = [
-    "huggingface_hub",
-    "transformers",
-    "gunicorn",
-    "cv2",
-    "insightface.model_zoo",
-    "insightface.utils.face_align",
-    "insightface.utils.storage",
-    "onnxruntime",
-    "optimum",
-    "optimum.pipelines",
-    "optimum.onnxruntime",
-    "clip_server.model.clip",
-    "clip_server.model.clip_onnx",
-    "clip_server.model.pretrained_models",
-    "clip_server.model.tokenization",
-    "torchvision.transforms",
-    "aiocache.backends.memory",
-    "aiocache.lock",
-    "aiocache.plugins"
-]
-ignore_missing_imports = true
-
 [tool.ruff]
 line-length = 120
 target-version = "py311"
--- a/machine-learning/requirements.txt
+++ b/machine-learning/requirements.txt
@@ -1,2 +0,0 @@
-# requirements to be installed with `--no-deps` flag
-clip-server==0.8.*
--- a/server/src/domain/smart-info/smart-info.service.spec.ts
+++ b/server/src/domain/smart-info/smart-info.service.spec.ts
@@ -193,7 +193,7 @@ describe(SmartInfoService.name, () => {
      expect(machineMock.encodeImage).toHaveBeenCalledWith(
        'http://immich-machine-learning:3003',
        { imagePath: 'path/to/resize.ext' },
-        { enabled: true, modelName: 'ViT-B-32::openai' },
+        { enabled: true, modelName: 'ViT-B-32__openai' },
      );
      expect(smartMock.upsert).toHaveBeenCalledWith({
        assetId: 'asset-1',
--- a/server/src/domain/system-config/system-config.core.ts
+++ b/server/src/domain/system-config/system-config.core.ts
@@ -67,7 +67,7 @@ export const defaults = Object.freeze<SystemConfig>({
    },
    clip: {
      enabled: true,
-      modelName: 'ViT-B-32::openai',
+      modelName: 'ViT-B-32__openai',
    },
    facialRecognition: {
      enabled: true,
--- a/server/src/domain/system-config/system-config.service.spec.ts
+++ b/server/src/domain/system-config/system-config.service.spec.ts
@@ -68,7 +68,7 @@ const updatedConfig = Object.freeze<SystemConfig>({
    },
    clip: {
      enabled: true,
-      modelName: 'ViT-B-32::openai',
+      modelName: 'ViT-B-32__openai',
    },
    facialRecognition: {
      enabled: true,
--- a/web/src/lib/components/admin-page/settings/machine-learning-settings/machine-learning-settings.svelte
+++ b/web/src/lib/components/admin-page/settings/machine-learning-settings/machine-learning-settings.svelte
@@ -140,9 +140,8 @@
              isEdited={machineLearningConfig.clip.modelName !== savedConfig.clip.modelName}
            >
              <p slot="desc" class="immich-form-label pb-2 text-sm">
-                The name of a CLIP model listed <a
-                  href="https://clip-as-service.jina.ai/user-guides/benchmark/#size-and-efficiency"><u>here</u></a
-                >. Note that you must re-run the 'Encode CLIP' job for all images upon changing a model.
+                The name of a CLIP model listed <a href="https://huggingface.co/immich-app"><u>here</u></a>. Note that
+                you must re-run the 'Encode CLIP' job for all images upon changing a model.
              </p>
            </SettingInputField>
          </div>