diff --git a/machine-learning/src/main.py b/machine-learning/src/main.py index b09a43464f..6243eef74b 100644 --- a/machine-learning/src/main.py +++ b/machine-learning/src/main.py @@ -5,7 +5,7 @@ import uvicorn from insightface.app import FaceAnalysis from transformers import pipeline -from sentence_transformers import SentenceTransformer, util +from sentence_transformers import SentenceTransformer from PIL import Image from fastapi import FastAPI from pydantic import BaseModel @@ -20,22 +20,32 @@ class ClipRequestBody(BaseModel): classification_model = os.getenv( - 'MACHINE_LEARNING_CLASSIFICATION_MODEL', 'microsoft/resnet-50') -object_model = os.getenv('MACHINE_LEARNING_OBJECT_MODEL', 'hustvl/yolos-tiny') -clip_image_model = os.getenv( - 'MACHINE_LEARNING_CLIP_IMAGE_MODEL', 'clip-ViT-B-32') -clip_text_model = os.getenv( - 'MACHINE_LEARNING_CLIP_TEXT_MODEL', 'clip-ViT-B-32') + "MACHINE_LEARNING_CLASSIFICATION_MODEL", "microsoft/resnet-50" +) +object_model = os.getenv("MACHINE_LEARNING_OBJECT_MODEL", "hustvl/yolos-tiny") +clip_image_model = os.getenv("MACHINE_LEARNING_CLIP_IMAGE_MODEL", "clip-ViT-B-32") +clip_text_model = os.getenv("MACHINE_LEARNING_CLIP_TEXT_MODEL", "clip-ViT-B-32") facial_recognition_model = os.getenv( - 'MACHINE_LEARNING_FACIAL_RECOGNITION_MODEL', 'buffalo_l') + "MACHINE_LEARNING_FACIAL_RECOGNITION_MODEL", "buffalo_l" +) -cache_folder = os.getenv('MACHINE_LEARNING_CACHE_FOLDER', '/cache') +cache_folder = os.getenv("MACHINE_LEARNING_CACHE_FOLDER", "/cache") _model_cache = {} app = FastAPI() +@app.on_event("startup") +async def startup_event(): + # Get all models + _get_model(object_model, "object-detection") + _get_model(classification_model, "image-classification") + _get_model(clip_image_model) + _get_model(clip_text_model) + _get_model(facial_recognition_model, "facial-recognition") + + @app.get("/") async def root(): return {"message": "Immich ML"} @@ -48,14 +58,14 @@ def ping(): @app.post("/object-detection/detect-object", status_code=200) def object_detection(payload: MlRequestBody): - model = _get_model(object_model, 'object-detection') + model = _get_model(object_model, "object-detection") assetPath = payload.thumbnailPath return run_engine(model, assetPath) @app.post("/image-classifier/tag-image", status_code=200) def image_classification(payload: MlRequestBody): - model = _get_model(classification_model, 'image-classification') + model = _get_model(classification_model, "image-classification") assetPath = payload.thumbnailPath return run_engine(model, assetPath) @@ -76,31 +86,32 @@ def clip_encode_text(payload: ClipRequestBody): @app.post("/facial-recognition/detect-faces", status_code=200) def facial_recognition(payload: MlRequestBody): - model = _get_model(facial_recognition_model, 'facial-recognition') + model = _get_model(facial_recognition_model, "facial-recognition") assetPath = payload.thumbnailPath img = cv.imread(assetPath) height, width, _ = img.shape results = [] faces = model.get(img) + for face in faces: if face.det_score < 0.7: continue x1, y1, x2, y2 = face.bbox - # min face size as percent of original image - # if (x2 - x1) / width < 0.03 or (y2 - y1) / height < 0.05: - # continue - results.append({ - "imageWidth": width, - "imageHeight": height, - "boundingBox": { - "x1": round(x1), - "y1": round(y1), - "x2": round(x2), - "y2": round(y2), - }, - "score": face.det_score.item(), - "embedding": face.normed_embedding.tolist() - }) + + results.append( + { + "imageWidth": width, + "imageHeight": height, + "boundingBox": { + "x1": round(x1), + "y1": round(y1), + "x2": round(x2), + "y2": round(y2), + }, + "score": face.det_score.item(), + "embedding": face.normed_embedding.tolist(), + } + ) return results @@ -109,11 +120,11 @@ def run_engine(engine, path): predictions = engine(path) for index, pred in enumerate(predictions): - tags = pred['label'].split(', ') - if (pred['score'] > 0.9): + tags = pred["label"].split(", ") + if pred["score"] > 0.9: result = [*result, *tags] - if (len(result) > 1): + if len(result) > 1: result = list(set(result)) return result @@ -121,25 +132,27 @@ def run_engine(engine, path): def _get_model(model, task=None): global _model_cache - key = '|'.join([model, str(task)]) + key = "|".join([model, str(task)]) if key not in _model_cache: if task: - if task == 'facial-recognition': + if task == "facial-recognition": face_model = FaceAnalysis( - name=model, root=cache_folder, allowed_modules=["detection", "recognition"]) + name=model, + root=cache_folder, + allowed_modules=["detection", "recognition"], + ) face_model.prepare(ctx_id=0, det_size=(640, 640)) _model_cache[key] = face_model else: _model_cache[key] = pipeline(model=model, task=task) else: - _model_cache[key] = SentenceTransformer( - model, cache_folder=cache_folder) + _model_cache[key] = SentenceTransformer(model, cache_folder=cache_folder) return _model_cache[key] if __name__ == "__main__": - host = os.getenv('MACHINE_LEARNING_HOST', '0.0.0.0') - port = int(os.getenv('MACHINE_LEARNING_PORT', 3003)) - is_dev = os.getenv('NODE_ENV') == 'development' + host = os.getenv("MACHINE_LEARNING_HOST", "0.0.0.0") + port = int(os.getenv("MACHINE_LEARNING_PORT", 3003)) + is_dev = os.getenv("NODE_ENV") == "development" uvicorn.run("main:app", host=host, port=port, reload=is_dev, workers=1)