1
0
mirror of https://github.com/immich-app/immich.git synced 2024-12-26 10:50:29 +02:00
immich/machine-learning/ann/export/run.py
Fynn Petersen-Frey 753292956e
feat(ml): ARMNN acceleration (#5667)
* feat(ml): ARMNN acceleration for CLIP

* wrap ANN as ONNX-Session

* strict typing

* normalize ARMNN CLIP embedding

* mutex to handle concurrent execution

* make inputs contiguous

* fine-grained locking; concurrent network execution

---------

Co-authored-by: mertalev <101130780+mertalev@users.noreply.github.com>
2024-01-11 18:26:46 +01:00

158 lines
4.9 KiB
Python

import logging
import os
import platform
import subprocess
from abc import abstractmethod
import onnx
import open_clip
import torch
from onnx2torch import convert
from onnxruntime.tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
from tinynn.converter import TFLiteConverter
class ExportBase(torch.nn.Module):
input_shape: tuple[int, ...]
def __init__(self, device: torch.device, name: str):
super().__init__()
self.device = device
self.name = name
self.optimize = 5
self.nchw_transpose = False
@abstractmethod
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor]:
pass
def dummy_input(self) -> torch.FloatTensor:
return torch.rand((1, 3, 224, 224), device=self.device)
class ArcFace(ExportBase):
input_shape = (1, 3, 112, 112)
def __init__(self, onnx_model_path: str, device: torch.device):
name, _ = os.path.splitext(os.path.basename(onnx_model_path))
super().__init__(device, name)
onnx_model = onnx.load_model(onnx_model_path)
make_input_shape_fixed(onnx_model.graph, onnx_model.graph.input[0].name, self.input_shape)
fix_output_shapes(onnx_model)
self.model = convert(onnx_model).to(device)
if self.device.type == "cuda":
self.model = self.model.half()
def forward(self, input_tensor: torch.Tensor) -> torch.FloatTensor:
embedding: torch.FloatTensor = self.model(
input_tensor.half() if self.device.type == "cuda" else input_tensor
).float()
assert isinstance(embedding, torch.FloatTensor)
return embedding
def dummy_input(self) -> torch.FloatTensor:
return torch.rand(self.input_shape, device=self.device)
class RetinaFace(ExportBase):
input_shape = (1, 3, 640, 640)
def __init__(self, onnx_model_path: str, device: torch.device):
name, _ = os.path.splitext(os.path.basename(onnx_model_path))
super().__init__(device, name)
self.optimize = 3
self.model = convert(onnx_model_path).eval().to(device)
if self.device.type == "cuda":
self.model = self.model.half()
def forward(self, input_tensor: torch.Tensor) -> tuple[torch.FloatTensor]:
out: torch.Tensor = self.model(input_tensor.half() if self.device.type == "cuda" else input_tensor)
return tuple(o.float() for o in out)
def dummy_input(self) -> torch.FloatTensor:
return torch.rand(self.input_shape, device=self.device)
class ClipVision(ExportBase):
input_shape = (1, 3, 224, 224)
def __init__(self, model_name: str, weights: str, device: torch.device):
super().__init__(device, model_name + "__" + weights)
self.model = open_clip.create_model(
model_name,
weights,
precision="fp16" if device.type == "cuda" else "fp32",
jit=False,
require_pretrained=True,
device=device,
)
def forward(self, input_tensor: torch.Tensor) -> torch.FloatTensor:
embedding: torch.Tensor = self.model.encode_image(
input_tensor.half() if self.device.type == "cuda" else input_tensor,
normalize=True,
).float()
return embedding
def export(model: ExportBase) -> None:
model.eval()
for param in model.parameters():
param.requires_grad = False
dummy_input = model.dummy_input()
model(dummy_input)
jit = torch.jit.trace(model, dummy_input) # type: ignore[no-untyped-call,attr-defined]
tflite_model_path = f"output/{model.name}.tflite"
os.makedirs("output", exist_ok=True)
converter = TFLiteConverter(
jit,
dummy_input,
tflite_model_path,
optimize=model.optimize,
nchw_transpose=model.nchw_transpose,
)
# segfaults on ARM, must run on x86_64 / AMD64
converter.convert()
armnn_model_path = f"output/{model.name}.armnn"
os.environ["LD_LIBRARY_PATH"] = "armnn"
subprocess.run(
[
"./armnnconverter",
"-f",
"tflite-binary",
"-m",
tflite_model_path,
"-i",
"input_tensor",
"-o",
"output_tensor",
"-p",
armnn_model_path,
]
)
def main() -> None:
if platform.machine() not in ("x86_64", "AMD64"):
raise RuntimeError(f"Can only run on x86_64 / AMD64, not {platform.machine()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
logging.warning(
"No CUDA available, cannot create fp16 model! proceeding to create a fp32 model (use only for testing)"
)
models = [
ClipVision("ViT-B-32", "openai", device),
ArcFace("buffalo_l_rec.onnx", device),
RetinaFace("buffalo_l_det.onnx", device),
]
for model in models:
export(model)
if __name__ == "__main__":
with torch.no_grad():
main()