You've already forked immich
mirror of
https://github.com/immich-app/immich.git
synced 2025-06-15 03:30:33 +02:00
feat(ml)!: switch image classification and CLIP models to ONNX (#3809)
This commit is contained in:
@ -1,4 +1,6 @@
|
||||
import asyncio
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
@ -8,6 +10,8 @@ import uvicorn
|
||||
from fastapi import Body, Depends, FastAPI
|
||||
from PIL import Image
|
||||
|
||||
from app.models.base import InferenceModel
|
||||
|
||||
from .config import settings
|
||||
from .models.cache import ModelCache
|
||||
from .schemas import (
|
||||
@ -25,19 +29,21 @@ app = FastAPI()
|
||||
|
||||
def init_state() -> None:
|
||||
app.state.model_cache = ModelCache(ttl=settings.model_ttl, revalidate=settings.model_ttl > 0)
|
||||
# asyncio is a huge bottleneck for performance, so we use a thread pool to run blocking code
|
||||
app.state.thread_pool = ThreadPoolExecutor(settings.request_threads)
|
||||
|
||||
|
||||
async def load_models() -> None:
|
||||
models = [
|
||||
(settings.classification_model, ModelType.IMAGE_CLASSIFICATION),
|
||||
(settings.clip_image_model, ModelType.CLIP),
|
||||
(settings.clip_text_model, ModelType.CLIP),
|
||||
(settings.facial_recognition_model, ModelType.FACIAL_RECOGNITION),
|
||||
models: list[tuple[str, ModelType, dict[str, Any]]] = [
|
||||
(settings.classification_model, ModelType.IMAGE_CLASSIFICATION, {}),
|
||||
(settings.clip_image_model, ModelType.CLIP, {"mode": "vision"}),
|
||||
(settings.clip_text_model, ModelType.CLIP, {"mode": "text"}),
|
||||
(settings.facial_recognition_model, ModelType.FACIAL_RECOGNITION, {}),
|
||||
]
|
||||
|
||||
# Get all models
|
||||
for model_name, model_type in models:
|
||||
await app.state.model_cache.get(model_name, model_type, eager=settings.eager_startup)
|
||||
for model_name, model_type, model_kwargs in models:
|
||||
await app.state.model_cache.get(model_name, model_type, eager=settings.eager_startup, **model_kwargs)
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
@ -46,11 +52,16 @@ async def startup_event() -> None:
|
||||
await load_models()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
app.state.thread_pool.shutdown()
|
||||
|
||||
|
||||
def dep_pil_image(byte_image: bytes = Body(...)) -> Image.Image:
|
||||
return Image.open(BytesIO(byte_image))
|
||||
|
||||
|
||||
def dep_cv_image(byte_image: bytes = Body(...)) -> cv2.Mat:
|
||||
def dep_cv_image(byte_image: bytes = Body(...)) -> np.ndarray[int, np.dtype[Any]]:
|
||||
byte_image_np = np.frombuffer(byte_image, np.uint8)
|
||||
return cv2.imdecode(byte_image_np, cv2.IMREAD_COLOR)
|
||||
|
||||
@ -74,7 +85,7 @@ async def image_classification(
|
||||
image: Image.Image = Depends(dep_pil_image),
|
||||
) -> list[str]:
|
||||
model = await app.state.model_cache.get(settings.classification_model, ModelType.IMAGE_CLASSIFICATION)
|
||||
labels = model.predict(image)
|
||||
labels = await predict(model, image)
|
||||
return labels
|
||||
|
||||
|
||||
@ -86,8 +97,8 @@ async def image_classification(
|
||||
async def clip_encode_image(
|
||||
image: Image.Image = Depends(dep_pil_image),
|
||||
) -> list[float]:
|
||||
model = await app.state.model_cache.get(settings.clip_image_model, ModelType.CLIP)
|
||||
embedding = model.predict(image)
|
||||
model = await app.state.model_cache.get(settings.clip_image_model, ModelType.CLIP, mode="vision")
|
||||
embedding = await predict(model, image)
|
||||
return embedding
|
||||
|
||||
|
||||
@ -97,8 +108,8 @@ async def clip_encode_image(
|
||||
status_code=200,
|
||||
)
|
||||
async def clip_encode_text(payload: TextModelRequest) -> list[float]:
|
||||
model = await app.state.model_cache.get(settings.clip_text_model, ModelType.CLIP)
|
||||
embedding = model.predict(payload.text)
|
||||
model = await app.state.model_cache.get(settings.clip_text_model, ModelType.CLIP, mode="text")
|
||||
embedding = await predict(model, payload.text)
|
||||
return embedding
|
||||
|
||||
|
||||
@ -111,10 +122,14 @@ async def facial_recognition(
|
||||
image: cv2.Mat = Depends(dep_cv_image),
|
||||
) -> list[dict[str, Any]]:
|
||||
model = await app.state.model_cache.get(settings.facial_recognition_model, ModelType.FACIAL_RECOGNITION)
|
||||
faces = model.predict(image)
|
||||
faces = await predict(model, image)
|
||||
return faces
|
||||
|
||||
|
||||
async def predict(model: InferenceModel, inputs: Any) -> Any:
|
||||
return await asyncio.get_running_loop().run_in_executor(app.state.thread_pool, model.predict, inputs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
is_dev = os.getenv("NODE_ENV") == "development"
|
||||
uvicorn.run(
|
||||
|
Reference in New Issue
Block a user