chore(ml): updated dockerfile, added typing, packaging (#2642)

* updated dockerfile, added typing, packaging apply env change * added arm64 support * added ml version pump, second try for arm64 * added linting config to pyproject.toml * renamed ml input field * fixed linter config * fixed dev docker compose
2025-08-09 23:17:29 +02:00 · 2023-06-05 10:40:48 -04:00
parent c92c442356
commit 1e748864c5
13 changed files with 2647 additions and 67 deletions
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -35,7 +35,7 @@ services:
    ports:
      - 3003:3003
    volumes:
-      - ../machine-learning/src:/usr/src/app
+      - ../machine-learning/app:/usr/src/app
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - model-cache:/cache
    env_file:
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -1,29 +1,26 @@
-FROM python:3.10 as builder
+FROM python:3.11 as builder
 ENV PYTHONDONTWRITEBYTECODE=1 \
  PYTHONUNBUFFERED=1 \
  PIP_NO_CACHE_DIR=true
 RUN pip install --upgrade pip && pip install poetry
 RUN poetry config installer.max-workers 10 && \
  poetry config virtualenvs.create false
 RUN python -m venv /opt/venv
-RUN /opt/venv/bin/pip install torch --index-url https://download.pytorch.org/whl/cpu
+ENV VIRTUAL_ENV="/opt/venv" PATH="/opt/venv/bin:${PATH}"
 RUN /opt/venv/bin/pip install transformers tqdm numpy scikit-learn scipy nltk sentencepiece fastapi Pillow uvicorn[standard]
 RUN /opt/venv/bin/pip install --no-deps sentence-transformers
 # Facial Recognition Stuff
 RUN /opt/venv/bin/pip install insightface onnxruntime
-FROM python:3.10-slim
+COPY poetry.lock pyproject.toml ./
 RUN poetry install --sync --no-interaction --no-ansi --no-root --only main
-ENV NODE_ENV=production
+FROM python:3.11-slim
-
+WORKDIR /usr/src/app
-COPY --from=builder /opt/venv /opt/venv
+ENV NODE_ENV=production \
-
+  TRANSFORMERS_CACHE=/cache \
 ENV TRANSFORMERS_CACHE=/cache \
  PYTHONDONTWRITEBYTECODE=1 \
  PYTHONUNBUFFERED=1 \
-  PATH="/opt/venv/bin:$PATH"
+  PATH="/opt/venv/bin:$PATH" \
  PYTHONPATH=`pwd`
-WORKDIR /usr/src/app
+COPY --from=builder /opt/venv /opt/venv
-
+COPY app .
-COPY . .
+ENTRYPOINT ["python", "main.py"]
 ENV PYTHONPATH=`pwd`
 CMD ["python", "src/main.py"]
--- a/machine-learning/README.md
+++ b/machine-learning/README.md
@@ -1,5 +1,13 @@
 # Immich Machine Learning
- Object Detection
+- Image classification
- Image Classification
+- CLIP embeddings
 - Facial recognition
 # Setup
 This project uses [Poetry](https://python-poetry.org/docs/#installation), so be sure to install it first.
 Running `poetry install --no-root --with dev` will install everything you need in an isolated virtual environment.
 To add or remove dependencies, you can use the commands `poetry add $PACKAGE_NAME` and `poetry remove $PACKAGE_NAME`, respectively.
 Be sure to commit the `poetry.lock` and `pyproject.toml` files to reflect any changes in dependencies.
--- a/machine-learning/app/main.py
+++ b/machine-learning/app/main.py
@@ -1,22 +1,23 @@
 import os
-import numpy as np
+from typing import Any
 from schemas import (
    EmbeddingResponse,
    FaceResponse,
    TagResponse,
    MessageResponse,
    TextModelRequest,
    TextResponse,
    VisionModelRequest,
 )
 import cv2 as cv
 import uvicorn
 from insightface.app import FaceAnalysis
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 from transformers import Pipeline
 from PIL import Image
 from fastapi import FastAPI
 from pydantic import BaseModel
 class MlRequestBody(BaseModel):
    """Request body carrying a single ``thumbnailPath`` string.

    NOTE(review): the endpoint signatures in this diff switch from this model
    to ``VisionModelRequest`` -- confirm whether this class is still needed.
    """

    thumbnailPath: str
 class ClipRequestBody(BaseModel):
    """Request body carrying a single ``text`` string.

    NOTE(review): the text-encoding endpoint in this diff switches from this
    model to ``TextModelRequest`` -- confirm whether this class is still needed.
    """

    text: str
 classification_model = os.getenv(
@@ -42,7 +43,7 @@ app = FastAPI()
@app.on_event("startup")
-async def startup_event():
+async def startup_event() -> None:
    models = [
        (classification_model, "image-classification"),
        (clip_image_model, "clip"),
@@ -58,42 +59,51 @@ async def startup_event():
            _get_model(model_name, model_type)
-@app.get("/")
+@app.get("/", response_model=MessageResponse)
-async def root():
+async def root() -> dict[str, str]:
    return {"message": "Immich ML"}
-@app.get("/ping")
+@app.get("/ping", response_model=TextResponse)
-def ping():
+def ping() -> str:
    return "pong"
-@app.post("/image-classifier/tag-image", status_code=200)
+@app.post("/image-classifier/tag-image", response_model=TagResponse, status_code=200)
-def image_classification(payload: MlRequestBody):
+def image_classification(payload: VisionModelRequest) -> list[str]:
    model = get_cached_model(classification_model, "image-classification")
-    assetPath = payload.thumbnailPath
+    assetPath = payload.image_path
-    return run_engine(model, assetPath)
+    labels = run_engine(model, assetPath)
    return labels
-@app.post("/sentence-transformer/encode-image", status_code=200)
+@app.post(
-def clip_encode_image(payload: MlRequestBody):
+    "/sentence-transformer/encode-image",
    response_model=EmbeddingResponse,
    status_code=200,
 )
 def clip_encode_image(payload: VisionModelRequest) -> list[float]:
    model = get_cached_model(clip_image_model, "clip")
-    assetPath = payload.thumbnailPath
+    image = Image.open(payload.image_path)
-    return model.encode(Image.open(assetPath)).tolist()
+    return model.encode(image).tolist()
-@app.post("/sentence-transformer/encode-text", status_code=200)
+@app.post(
-def clip_encode_text(payload: ClipRequestBody):
+    "/sentence-transformer/encode-text",
    response_model=EmbeddingResponse,
    status_code=200,
 )
 def clip_encode_text(payload: TextModelRequest) -> list[float]:
    model = get_cached_model(clip_text_model, "clip")
-    text = payload.text
+    return model.encode(payload.text).tolist()
    return model.encode(text).tolist()
-@app.post("/facial-recognition/detect-faces", status_code=200)
+@app.post(
-def facial_recognition(payload: MlRequestBody):
+    "/facial-recognition/detect-faces", response_model=FaceResponse, status_code=200
 )
 def facial_recognition(payload: VisionModelRequest) -> list[dict[str, Any]]:
    model = get_cached_model(facial_recognition_model, "facial-recognition")
-    assetPath = payload.thumbnailPath
+    img = cv.imread(payload.image_path)
    img = cv.imread(assetPath)
    height, width, _ = img.shape
    results = []
    faces = model.get(img)
@@ -120,11 +130,11 @@ def facial_recognition(payload: MlRequestBody):
    return results
-def run_engine(engine, path):
+def run_engine(engine: Pipeline, path: str) -> list[str]:
-    result = []
+    result: list[str] = []
-    predictions = engine(path)
+    predictions: list[dict[str, Any]] = engine(path)  # type: ignore
-    for index, pred in enumerate(predictions):
+    for pred in predictions:
        tags = pred["label"].split(", ")
        if pred["score"] > min_tag_score:
            result = [*result, *tags]
@@ -135,7 +145,7 @@ def run_engine(engine, path):
    return result
-def get_cached_model(model, task):
+def get_cached_model(model, task) -> Any:
    global _model_cache
    key = "|".join([model, str(task)])
    if key not in _model_cache:
@@ -145,7 +155,7 @@ def get_cached_model(model, task):
    return _model_cache[key]
-def _get_model(model, task):
+def _get_model(model, task) -> Any:
    match task:
        case "facial-recognition":
            model = FaceAnalysis(
--- a/machine-learning/app/schemas.py
+++ b/machine-learning/app/schemas.py
@@ -0,0 +1,64 @@
 from pydantic import BaseModel
 def to_lower_camel(string: str) -> str:
    """Convert a snake_case identifier to lowerCamelCase.

    The text before the first underscore is kept exactly as written. Every
    later underscore-separated piece goes through ``str.capitalize`` (first
    character upper-cased, the rest lower-cased) and the pieces are joined
    without a separator, e.g. ``"image_path"`` becomes ``"imagePath"``.
    """
    # str.split always yields at least one element, so the unpacking is safe
    # even for an empty string.
    head, *tail = string.split("_")
    return head + "".join(piece.capitalize() for piece in tail)
 class VisionModelRequest(BaseModel):
    """Request body for the image endpoints: the path of the image to process.

    Field aliases are produced by ``to_lower_camel``, so the field is exposed
    as ``imagePath``; ``allow_population_by_field_name`` additionally lets the
    model be populated with the Python name ``image_path``.
    """

    image_path: str
    class Config:
        # Derive the camelCase alias from the snake_case field name.
        alias_generator = to_lower_camel
        # Accept the field name as well as the generated alias on input.
        allow_population_by_field_name = True
 class TextModelRequest(BaseModel):
    """Request body carrying the ``text`` string to be encoded."""

    text: str
 class TextResponse(BaseModel):
    """Response whose whole payload is a single string (custom root type)."""

    __root__: str
 class MessageResponse(BaseModel):
    """Response object with a single ``message`` string field."""

    message: str
 class TagResponse(BaseModel):
    """Response whose whole payload is a list of strings (custom root type)."""

    __root__: list[str]
 class Embedding(BaseModel):
    """A list of floats used as a custom root type.

    Referenced by ``EmbeddingResponse`` and by ``Face.embedding``.
    """

    __root__: list[float]
 class EmbeddingResponse(BaseModel):
    """Response whose whole payload is one ``Embedding`` (custom root type)."""

    __root__: Embedding
 class BoundingBox(BaseModel):
    """Four integer coordinates describing a box.

    NOTE(review): presumably (x1, y1) and (x2, y2) are opposite corners in
    pixel units -- confirm against the code that builds these values.
    """

    x1: int
    y1: int
    x2: int
    y2: int
 class Face(BaseModel):
    """One detected face: source image size, box, score and embedding.

    Field aliases are produced by ``to_lower_camel`` (``imageWidth``,
    ``imageHeight``, ``boundingBox``); ``allow_population_by_field_name`` also
    allows population with the snake_case names.
    """

    image_width: int
    image_height: int
    bounding_box: BoundingBox
    # NOTE(review): presumably the detector's confidence for this face -- confirm.
    score: float
    embedding: Embedding
    class Config:
        # Derive camelCase aliases from the snake_case field names.
        alias_generator = to_lower_camel
        # Accept field names as well as the generated aliases on input.
        allow_population_by_field_name = True
 class FaceResponse(BaseModel):
    """Response whose whole payload is a list of ``Face`` (custom root type)."""

    __root__: list[Face]
--- a/machine-learning/poetry.lock
+++ b/machine-learning/poetry.lock
--- a/machine-learning/pyproject.toml
+++ b/machine-learning/pyproject.toml
@@ -0,0 +1,56 @@
 [tool.poetry]
 name = "machine-learning"
 version = "1.59.1"
 description = ""
 authors = ["Hau Tran <alex.tran1502@gmail.com>"]
 readme = "README.md"
 packages = [{include = "app"}]
 [tool.poetry.dependencies]
 python = "^3.11"
 torch = [
    {markers = "platform_machine == 'arm64' or platform_machine == 'aarch64'", version = "=2.0.1", source = "pypi"},
    {markers = "platform_machine == 'amd64' or platform_machine == 'x86_64'", version = "=2.0.1+cpu", source = "pytorch-cpu"}
 ]
 transformers = "^4.29.2"
 sentence-transformers = "^2.2.2"
 onnxruntime = "^1.15.0"
 insightface = "^0.7.3"
 opencv-python-headless = "^4.7.0.72"
 pillow = "^9.5.0"
 fastapi = "^0.95.2"
 uvicorn = {extras = ["standard"], version = "^0.22.0"}
 pydantic = "^1.10.8"
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.3.0"
 black = "^23.3.0"
 pytest = "^7.3.1"
 [[tool.poetry.source]]
 name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
 priority = "explicit"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 [tool.flake8]
 max-line-length = 120
 [tool.mypy]
 python_version = "3.11"
 plugins = "pydantic.mypy"
 follow_imports = "silent"
 warn_redundant_casts = true
 disallow_any_generics = true
 check_untyped_defs = true
 no_implicit_reexport = true
 disallow_untyped_defs = true
 [tool.pydantic-mypy]
 init_forbid_extra = true
 init_typed = true
 warn_required_dynamic_aliases = true
 warn_untyped_fields = true
--- a/misc/release/pump-version.sh
+++ b/misc/release/pump-version.sh
@@ -63,6 +63,7 @@ if [ "$CURRENT_SERVER" != "$NEXT_SERVER" ]; then
  echo "Pumping Server: $CURRENT_SERVER => $NEXT_SERVER"
  npm --prefix server version $SERVER_PUMP
  npm --prefix server run api:generate
  poetry --directory machine-learning version $SERVER_PUMP
 fi
 if [ "$CURRENT_MOBILE" != "$NEXT_MOBILE" ]; then
--- a/server/libs/domain/src/facial-recognition/facial-recognition.service.spec.ts
+++ b/server/libs/domain/src/facial-recognition/facial-recognition.service.spec.ts
@@ -175,7 +175,7 @@ describe(FacialRecognitionService.name, () => {
      assetMock.getByIds.mockResolvedValue([assetEntityStub.image]);
      await sut.handleRecognizeFaces({ id: assetEntityStub.image.id });
      expect(machineLearningMock.detectFaces).toHaveBeenCalledWith({
-        thumbnailPath: assetEntityStub.image.resizePath,
+        imagePath: assetEntityStub.image.resizePath,
      });
      expect(faceMock.create).not.toHaveBeenCalled();
      expect(jobMock.queue).not.toHaveBeenCalled();
--- a/server/libs/domain/src/facial-recognition/facial-recognition.services.ts
+++ b/server/libs/domain/src/facial-recognition/facial-recognition.services.ts
@@ -54,7 +54,7 @@ export class FacialRecognitionService {
      return false;
    }
-    const faces = await this.machineLearning.detectFaces({ thumbnailPath: asset.resizePath });
+    const faces = await this.machineLearning.detectFaces({ imagePath: asset.resizePath });
    this.logger.debug(`${faces.length} faces detected in ${asset.resizePath}`);
    this.logger.verbose(faces.map((face) => ({ ...face, embedding: `float[${face.embedding.length}]` })));
--- a/server/libs/domain/src/smart-info/machine-learning.interface.ts
+++ b/server/libs/domain/src/smart-info/machine-learning.interface.ts
@@ -1,7 +1,7 @@
 export const IMachineLearningRepository = 'IMachineLearningRepository';
 export interface MachineLearningInput {
-  thumbnailPath: string;
+  imagePath: string;
 }
 export interface BoundingBox {
--- a/server/libs/domain/src/smart-info/smart-info.service.spec.ts
+++ b/server/libs/domain/src/smart-info/smart-info.service.spec.ts
@@ -84,7 +84,7 @@ describe(SmartInfoService.name, () => {
      await sut.handleClassifyImage({ id: asset.id });
-      expect(machineMock.classifyImage).toHaveBeenCalledWith({ thumbnailPath: 'path/to/resize.ext' });
+      expect(machineMock.classifyImage).toHaveBeenCalledWith({ imagePath: 'path/to/resize.ext' });
      expect(smartMock.upsert).toHaveBeenCalledWith({
        assetId: 'asset-1',
        tags: ['tag1', 'tag2', 'tag3'],
@@ -143,7 +143,7 @@ describe(SmartInfoService.name, () => {
      await sut.handleEncodeClip({ id: asset.id });
-      expect(machineMock.encodeImage).toHaveBeenCalledWith({ thumbnailPath: 'path/to/resize.ext' });
+      expect(machineMock.encodeImage).toHaveBeenCalledWith({ imagePath: 'path/to/resize.ext' });
      expect(smartMock.upsert).toHaveBeenCalledWith({
        assetId: 'asset-1',
        clipEmbedding: [0.01, 0.02, 0.03],
--- a/server/libs/domain/src/smart-info/smart-info.service.ts
+++ b/server/libs/domain/src/smart-info/smart-info.service.ts
@@ -40,7 +40,7 @@ export class SmartInfoService {
      return false;
    }
-    const tags = await this.machineLearning.classifyImage({ thumbnailPath: asset.resizePath });
+    const tags = await this.machineLearning.classifyImage({ imagePath: asset.resizePath });
    if (tags.length === 0) {
      return false;
    }
@@ -73,7 +73,7 @@ export class SmartInfoService {
      return false;
    }
-    const clipEmbedding = await this.machineLearning.encodeImage({ thumbnailPath: asset.resizePath });
+    const clipEmbedding = await this.machineLearning.encodeImage({ imagePath: asset.resizePath });
    await this.repository.upsert({ assetId: asset.id, clipEmbedding: clipEmbedding });
    return true;