From 34201be74c613d8b2be737e692b3b5f8e47151bd Mon Sep 17 00:00:00 2001 From: Zeeshan Khan Date: Sat, 17 Jun 2023 22:49:19 -0500 Subject: [PATCH] feat(ml) backend takes image over HTTP (#2783) * using pydantic BaseSetting * ML API takes image file as input * keeping image in memory * reducing duplicate code * using bytes instead of UploadFile & other small code improvements * removed form-multipart, using HTTP body * format code --------- Co-authored-by: Alex Tran --- docker/docker-compose.dev.yml | 1 - docker/docker-compose.yml | 1 - machine-learning/app/config.py | 22 ++++ machine-learning/app/main.py | 117 +++++++++--------- machine-learning/app/models.py | 20 +-- machine-learning/app/schemas.py | 8 -- machine-learning/poetry.lock | 16 ++- .../machine-learning.repository.ts | 11 +- 8 files changed, 116 insertions(+), 80 deletions(-) create mode 100644 machine-learning/app/config.py diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index a596c5e4e0..101017bc1f 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -36,7 +36,6 @@ services: - 3003:3003 volumes: - ../machine-learning/app:/usr/src/app - - ${UPLOAD_LOCATION}:/usr/src/app/upload - model-cache:/cache env_file: - .env diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 160dd2ace1..af0848abcd 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -33,7 +33,6 @@ services: container_name: immich_machine_learning image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release} volumes: - - ${UPLOAD_LOCATION}:/usr/src/app/upload - model-cache:/cache env_file: - .env diff --git a/machine-learning/app/config.py b/machine-learning/app/config.py new file mode 100644 index 0000000000..f6ce64e757 --- /dev/null +++ b/machine-learning/app/config.py @@ -0,0 +1,22 @@ +from pydantic import BaseSettings + +class Settings(BaseSettings): + cache_folder: str = "/cache" + classification_model: str = "microsoft/resnet-50" + clip_image_model: str = "clip-ViT-B-32" + clip_text_model: str = "clip-ViT-B-32" + facial_recognition_model: str = "buffalo_l" + min_tag_score: float = 0.9 + eager_startup: bool = True + model_ttl: int = 300 + host: str = "0.0.0.0" + port: int = 3003 + workers: int = 1 + min_face_score: float = 0.7 + + class Config(BaseSettings.Config): + env_prefix = 'MACHINE_LEARNING_' + case_sensitive = False + + +settings = Settings() diff --git a/machine-learning/app/main.py b/machine-learning/app/main.py index cccf9d1898..cb65caaa46 100644 --- a/machine-learning/app/main.py +++ b/machine-learning/app/main.py @@ -1,4 +1,5 @@ import os +import io from typing import Any from cache import ModelCache @@ -9,52 +10,44 @@ from schemas import ( MessageResponse, TextModelRequest, TextResponse, - VisionModelRequest, ) import uvicorn - from PIL import Image -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, Depends, Body from models import get_model, run_classification, run_facial_recognition - -classification_model = os.getenv( - "MACHINE_LEARNING_CLASSIFICATION_MODEL", "microsoft/resnet-50" -) -clip_image_model = os.getenv("MACHINE_LEARNING_CLIP_IMAGE_MODEL", "clip-ViT-B-32") -clip_text_model = os.getenv("MACHINE_LEARNING_CLIP_TEXT_MODEL", "clip-ViT-B-32") -facial_recognition_model = os.getenv( - "MACHINE_LEARNING_FACIAL_RECOGNITION_MODEL", "buffalo_l" -) - -min_tag_score = float(os.getenv("MACHINE_LEARNING_MIN_TAG_SCORE", 0.9)) -eager_startup = ( - os.getenv("MACHINE_LEARNING_EAGER_STARTUP", "true") == "true" -) # loads all models at startup -model_ttl = int(os.getenv("MACHINE_LEARNING_MODEL_TTL", 300)) +from config import settings _model_cache = None + app = FastAPI() @app.on_event("startup") async def startup_event() -> None: global _model_cache - _model_cache = ModelCache(ttl=model_ttl, revalidate=True) + _model_cache = ModelCache(ttl=settings.model_ttl, revalidate=True) models = [ - (classification_model, "image-classification"), - (clip_image_model, "clip"), - (clip_text_model, "clip"), - (facial_recognition_model, "facial-recognition"), + (settings.classification_model, "image-classification"), + (settings.clip_image_model, "clip"), + (settings.clip_text_model, "clip"), + (settings.facial_recognition_model, "facial-recognition"), ] # Get all models for model_name, model_type in models: - if eager_startup: + if settings.eager_startup: await _model_cache.get_cached_model(model_name, model_type) else: get_model(model_name, model_type) +def dep_model_cache(): + if _model_cache is None: + raise HTTPException(status_code=500, detail="Unable to load model.") + +def dep_input_image(image: bytes = Body(...)) -> Image: + return Image.open(io.BytesIO(image)) + @app.get("/", response_model=MessageResponse) async def root() -> dict[str, str]: return {"message": "Immich ML"} @@ -65,29 +58,36 @@ def ping() -> str: return "pong" -@app.post("/image-classifier/tag-image", response_model=TagResponse, status_code=200) -async def image_classification(payload: VisionModelRequest) -> list[str]: - if _model_cache is None: - raise HTTPException(status_code=500, detail="Unable to load model.") - - model = await _model_cache.get_cached_model( - classification_model, "image-classification" - ) - labels = run_classification(model, payload.image_path, min_tag_score) - return labels +@app.post( + "/image-classifier/tag-image", + response_model=TagResponse, + status_code=200, + dependencies=[Depends(dep_model_cache)], +) +async def image_classification( + image: Image = Depends(dep_input_image) +) -> list[str]: + try: + model = await _model_cache.get_cached_model( + settings.classification_model, "image-classification" + ) + labels = run_classification(model, image, settings.min_tag_score) + except Exception as ex: + raise HTTPException(status_code=500, detail=str(ex)) + else: + return labels @app.post( "/sentence-transformer/encode-image", response_model=EmbeddingResponse, status_code=200, + dependencies=[Depends(dep_model_cache)], ) -async def clip_encode_image(payload: VisionModelRequest) -> list[float]: - if _model_cache is None: - raise HTTPException(status_code=500, detail="Unable to load model.") - - model = await _model_cache.get_cached_model(clip_image_model, "clip") - image = Image.open(payload.image_path) +async def clip_encode_image( + image: Image = Depends(dep_input_image) +) -> list[float]: + model = await _model_cache.get_cached_model(settings.clip_image_model, "clip") embedding = model.encode(image).tolist() return embedding @@ -96,33 +96,38 @@ async def clip_encode_image(payload: VisionModelRequest) -> list[float]: "/sentence-transformer/encode-text", response_model=EmbeddingResponse, status_code=200, + dependencies=[Depends(dep_model_cache)], ) -async def clip_encode_text(payload: TextModelRequest) -> list[float]: - if _model_cache is None: - raise HTTPException(status_code=500, detail="Unable to load model.") - - model = await _model_cache.get_cached_model(clip_text_model, "clip") +async def clip_encode_text( + payload: TextModelRequest +) -> list[float]: + model = await _model_cache.get_cached_model(settings.clip_text_model, "clip") embedding = model.encode(payload.text).tolist() return embedding @app.post( - "/facial-recognition/detect-faces", response_model=FaceResponse, status_code=200 + "/facial-recognition/detect-faces", + response_model=FaceResponse, + status_code=200, + dependencies=[Depends(dep_model_cache)], ) -async def facial_recognition(payload: VisionModelRequest) -> list[dict[str, Any]]: - if _model_cache is None: - raise HTTPException(status_code=500, detail="Unable to load model.") - +async def facial_recognition( + image: bytes = Body(...), +) -> list[dict[str, Any]]: model = await _model_cache.get_cached_model( - facial_recognition_model, "facial-recognition" + settings.facial_recognition_model, "facial-recognition" ) - faces = run_facial_recognition(model, payload.image_path) + faces = run_facial_recognition(model, image) return faces if __name__ == "__main__": - host = os.getenv("MACHINE_LEARNING_HOST", "0.0.0.0") - port = int(os.getenv("MACHINE_LEARNING_PORT", 3003)) is_dev = os.getenv("NODE_ENV") == "development" - - uvicorn.run("main:app", host=host, port=port, reload=is_dev, workers=1) + uvicorn.run( + "main:app", + host=settings.host, + port=settings.port, + reload=is_dev, + workers=settings.workers, + ) diff --git a/machine-learning/app/models.py b/machine-learning/app/models.py index ed7c4bf785..04bd3b70be 100644 --- a/machine-learning/app/models.py +++ b/machine-learning/app/models.py @@ -1,14 +1,15 @@ import torch from insightface.app import FaceAnalysis from pathlib import Path -import os from transformers import pipeline, Pipeline from sentence_transformers import SentenceTransformer -from typing import Any +from typing import Any, BinaryIO import cv2 as cv +import numpy as np +from PIL import Image +from config import settings -cache_folder = os.getenv("MACHINE_LEARNING_CACHE_FOLDER", "/cache") device = "cuda" if torch.cuda.is_available() else "cpu" @@ -49,9 +50,9 @@ def get_model(model_name: str, model_type: str, **model_kwargs): def run_classification( - model: Pipeline, image_path: str, min_score: float | None = None + model: Pipeline, image: Image, min_score: float | None = None ): - predictions: list[dict[str, Any]] = model(image_path) # type: ignore + predictions: list[dict[str, Any]] = model(image) # type: ignore result = { tag for pred in predictions @@ -63,9 +64,10 @@ def run_classification( def run_facial_recognition( - model: FaceAnalysis, image_path: str + model: FaceAnalysis, image: bytes ) -> list[dict[str, Any]]: - img = cv.imread(image_path) + file_bytes = np.frombuffer(image, dtype=np.uint8) + img = cv.imdecode(file_bytes, cv.IMREAD_COLOR) height, width, _ = img.shape results = [] faces = model.get(img) @@ -101,7 +103,7 @@ def _load_facial_recognition( if isinstance(cache_dir, Path): cache_dir = cache_dir.as_posix() if min_face_score is None: - min_face_score = float(os.getenv("MACHINE_LEARNING_MIN_FACE_SCORE", 0.7)) + min_face_score = settings.min_face_score model = FaceAnalysis( name=model_name, @@ -114,4 +116,4 @@ def _load_facial_recognition( def _get_cache_dir(model_name: str, model_type: str) -> Path: - return Path(cache_folder, device, model_type, model_name) + return Path(settings.cache_folder, device, model_type, model_name) diff --git a/machine-learning/app/schemas.py b/machine-learning/app/schemas.py index 5a27f5b989..ed58e4ea3a 100644 --- a/machine-learning/app/schemas.py +++ b/machine-learning/app/schemas.py @@ -9,14 +9,6 @@ def to_lower_camel(string: str) -> str: return "".join(tokens) -class VisionModelRequest(BaseModel): - image_path: str - - class Config: - alias_generator = to_lower_camel - allow_population_by_field_name = True - - class TextModelRequest(BaseModel): text: str diff --git a/machine-learning/poetry.lock b/machine-learning/poetry.lock index 720f5e205e..0b7eab18d3 100644 --- a/machine-learning/poetry.lock +++ b/machine-learning/poetry.lock @@ -1733,6 +1733,8 @@ files = [ {file = "scikit_image-0.21.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c01e3ab0a1fabfd8ce30686d4401b7ed36e6126c9d4d05cb94abf6bdc46f7ac9"}, {file = "scikit_image-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ef5d8d1099317b7b315b530348cbfa68ab8ce32459de3c074d204166951025c"}, {file = "scikit_image-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b1e96c59cab640ca5c5b22c501524cfaf34cbe0cb51ba73bd9a9ede3fb6e1d"}, + {file = "scikit_image-0.21.0-cp39-cp39-win_amd64.whl", hash = "sha256:9cffcddd2a5594c0a06de2ae3e1e25d662745a26f94fda31520593669677c010"}, + {file = "scikit_image-0.21.0.tar.gz", hash = "sha256:b33e823c54e6f11873ea390ee49ef832b82b9f70752c8759efd09d5a4e3d87f0"}, ] [package.dependencies] @@ -2088,9 +2090,9 @@ opt-einsum = ["opt-einsum (>=3.3)"] [[package]] name = "torch" version = "2.0.1+cpu" -description = "" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false -python-versions = "*" +python-versions = ">=3.8.0" files = [ {file = "torch-2.0.1+cpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:fec257249ba014c68629a1994b0c6e7356e20e1afc77a87b9941a40e5095285d"}, {file = "torch-2.0.1+cpu-cp310-cp310-win_amd64.whl", hash = "sha256:ca88b499973c4c027e32c4960bf20911d7e984bd0c55cda181dc643559f3d93f"}, @@ -2102,6 +2104,16 @@ files = [ {file = "torch-2.0.1+cpu-cp39-cp39-win_amd64.whl", hash = "sha256:f263f8e908288427ae81441fef540377f61e339a27632b1bbe33cf78292fdaea"}, ] +[package.dependencies] +filelock = "*" +jinja2 = "*" +networkx = "*" +sympy = "*" +typing-extensions = "*" + +[package.extras] +opt-einsum = ["opt-einsum (>=3.3)"] + [package.source] type = "legacy" url = "https://download.pytorch.org/whl/cpu" diff --git a/server/src/infra/repositories/machine-learning.repository.ts b/server/src/infra/repositories/machine-learning.repository.ts index 0f8d1707bd..40398445a0 100644 --- a/server/src/infra/repositories/machine-learning.repository.ts +++ b/server/src/infra/repositories/machine-learning.repository.ts @@ -1,21 +1,26 @@ import { DetectFaceResult, IMachineLearningRepository, MachineLearningInput, MACHINE_LEARNING_URL } from '@app/domain'; import { Injectable } from '@nestjs/common'; import axios from 'axios'; +import { createReadStream } from 'fs'; const client = axios.create({ baseURL: MACHINE_LEARNING_URL }); @Injectable() export class MachineLearningRepository implements IMachineLearningRepository { + private post(input: MachineLearningInput, endpoint: string): Promise { + return client.post(endpoint, createReadStream(input.imagePath)).then((res) => res.data); + } + classifyImage(input: MachineLearningInput): Promise { - return client.post('/image-classifier/tag-image', input).then((res) => res.data); + return this.post(input, '/image-classifier/tag-image'); } detectFaces(input: MachineLearningInput): Promise { - return client.post('/facial-recognition/detect-faces', input).then((res) => res.data); + return this.post(input, '/facial-recognition/detect-faces'); } encodeImage(input: MachineLearningInput): Promise { - return client.post('/sentence-transformer/encode-image', input).then((res) => res.data); + return this.post(input, '/sentence-transformer/encode-image'); } encodeText(input: string): Promise {