feat(ml)!: switch image classification and CLIP models to ONNX (#3809)

2025-06-15 03:30:33 +02:00 · 2023-08-25 00:28:51 -04:00
parent 8211afb726
commit 165b91b068
14 changed files with 1617 additions and 507 deletions
--- a/machine-learning/app/main.py
+++ b/machine-learning/app/main.py
@ -1,4 +1,6 @@
+import asyncio
 import os
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from typing import Any

@ -8,6 +10,8 @@ import uvicorn
 from fastapi import Body, Depends, FastAPI
 from PIL import Image

+from app.models.base import InferenceModel
+
 from .config import settings
 from .models.cache import ModelCache
 from .schemas import (
@ -25,19 +29,21 @@ app = FastAPI()

 def init_state() -> None:
    app.state.model_cache = ModelCache(ttl=settings.model_ttl, revalidate=settings.model_ttl > 0)
+    # asyncio is a huge bottleneck for performance, so we use a thread pool to run blocking code
+    app.state.thread_pool = ThreadPoolExecutor(settings.request_threads)


 async def load_models() -> None:
-    models = [
-        (settings.classification_model, ModelType.IMAGE_CLASSIFICATION),
-        (settings.clip_image_model, ModelType.CLIP),
-        (settings.clip_text_model, ModelType.CLIP),
-        (settings.facial_recognition_model, ModelType.FACIAL_RECOGNITION),
+    models: list[tuple[str, ModelType, dict[str, Any]]] = [
+        (settings.classification_model, ModelType.IMAGE_CLASSIFICATION, {}),
+        (settings.clip_image_model, ModelType.CLIP, {"mode": "vision"}),
+        (settings.clip_text_model, ModelType.CLIP, {"mode": "text"}),
+        (settings.facial_recognition_model, ModelType.FACIAL_RECOGNITION, {}),
    ]

    # Get all models
-    for model_name, model_type in models:
-        await app.state.model_cache.get(model_name, model_type, eager=settings.eager_startup)
+    for model_name, model_type, model_kwargs in models:
+        await app.state.model_cache.get(model_name, model_type, eager=settings.eager_startup, **model_kwargs)


@app.on_event("startup")
@ -46,11 +52,16 @@ async def startup_event() -> None:
    await load_models()


+@app.on_event("shutdown")
+async def shutdown_event() -> None:
+    app.state.thread_pool.shutdown()
+
+
 def dep_pil_image(byte_image: bytes = Body(...)) -> Image.Image:
    return Image.open(BytesIO(byte_image))


-def dep_cv_image(byte_image: bytes = Body(...)) -> cv2.Mat:
+def dep_cv_image(byte_image: bytes = Body(...)) -> np.ndarray[int, np.dtype[Any]]:
    byte_image_np = np.frombuffer(byte_image, np.uint8)
    return cv2.imdecode(byte_image_np, cv2.IMREAD_COLOR)

@ -74,7 +85,7 @@ async def image_classification(
    image: Image.Image = Depends(dep_pil_image),
 ) -> list[str]:
    model = await app.state.model_cache.get(settings.classification_model, ModelType.IMAGE_CLASSIFICATION)
-    labels = model.predict(image)
+    labels = await predict(model, image)
    return labels


@ -86,8 +97,8 @@ async def image_classification(
 async def clip_encode_image(
    image: Image.Image = Depends(dep_pil_image),
 ) -> list[float]:
-    model = await app.state.model_cache.get(settings.clip_image_model, ModelType.CLIP)
-    embedding = model.predict(image)
+    model = await app.state.model_cache.get(settings.clip_image_model, ModelType.CLIP, mode="vision")
+    embedding = await predict(model, image)
    return embedding


@ -97,8 +108,8 @@ async def clip_encode_image(
    status_code=200,
 )
 async def clip_encode_text(payload: TextModelRequest) -> list[float]:
-    model = await app.state.model_cache.get(settings.clip_text_model, ModelType.CLIP)
-    embedding = model.predict(payload.text)
+    model = await app.state.model_cache.get(settings.clip_text_model, ModelType.CLIP, mode="text")
+    embedding = await predict(model, payload.text)
    return embedding


@ -111,10 +122,14 @@ async def facial_recognition(
    image: cv2.Mat = Depends(dep_cv_image),
 ) -> list[dict[str, Any]]:
    model = await app.state.model_cache.get(settings.facial_recognition_model, ModelType.FACIAL_RECOGNITION)
-    faces = model.predict(image)
+    faces = await predict(model, image)
    return faces


+async def predict(model: InferenceModel, inputs: Any) -> Any:
+    return await asyncio.get_running_loop().run_in_executor(app.state.thread_pool, model.predict, inputs)
+
+
 if __name__ == "__main__":
    is_dev = os.getenv("NODE_ENV") == "development"
    uvicorn.run(