diff --git a/docker/mlaccel-armnn.yml b/docker/mlaccel-armnn.yml new file mode 100644 index 0000000000..fc9aa2521e --- /dev/null +++ b/docker/mlaccel-armnn.yml @@ -0,0 +1,11 @@ +version: "3.8" + +# ML acceleration on supported Mali ARM GPUs using ARM-NN + +services: + mlaccel: + devices: + - /dev/mali0:/dev/mali0 + volumes: + - /lib/firmware/mali_csffw.bin:/lib/firmware/mali_csffw.bin:ro # Mali firmware for your chipset (not always required depending on the driver) + - /usr/lib/libmali.so:/usr/lib/libmali.so:ro # Mali driver for your chipset (always required) diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile index f0c2214931..b32a564dfb 100644 --- a/machine-learning/Dockerfile +++ b/machine-learning/Dockerfile @@ -13,17 +13,40 @@ ENV VIRTUAL_ENV="/opt/venv" PATH="/opt/venv/bin:${PATH}" COPY poetry.lock pyproject.toml ./ RUN poetry install --sync --no-interaction --no-ansi --no-root --only main -FROM python:3.11-slim-bookworm@sha256:8f64a67710f3d981cf3008d6f9f1dbe61accd7927f165f4e37ea3f8b883ccc3f +ARG TARGETPLATFORM +ENV ARMNN_PATH=/opt/armnn +COPY ann /opt/ann +RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + mkdir /opt/armnn && \ + curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \ + cd /opt/ann && \ + sh build.sh; \ + else \ + mkdir /opt/armnn; \ + fi + + +FROM python:3.11-slim-bookworm@sha256:8f64a67710f3d981cf3008d6f9f1dbe61accd7927f165f4e37ea3f8b883ccc3f +ARG TARGETPLATFORM RUN apt-get update && apt-get install -y --no-install-recommends tini libmimalloc2.0 && rm -rf /var/lib/apt/lists/* +RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd && \ + rm -rf /var/lib/apt/lists/* && \ + mkdir --parents /etc/OpenCL/vendors && \ + echo "/usr/lib/libmali.so" > /etc/OpenCL/vendors/mali.icd && \ + mkdir /opt/armnn; \ + fi + WORKDIR /usr/src/app ENV NODE_ENV=production \ 
TRANSFORMERS_CACHE=/cache \ PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PATH="/opt/venv/bin:$PATH" \ - PYTHONPATH=/usr/src + PYTHONPATH=/usr/src \ + LD_LIBRARY_PATH=/opt/armnn # prevent core dumps RUN echo "hard core 0" >> /etc/security/limits.conf && \ @@ -31,7 +54,10 @@ RUN echo "hard core 0" >> /etc/security/limits.conf && \ echo 'ulimit -S -c 0 > /dev/null 2>&1' >> /etc/profile COPY --from=builder /opt/venv /opt/venv +COPY --from=builder /opt/armnn/libarmnn.so.?? /opt/armnn/libarmnnOnnxParser.so.?? /opt/armnn/libarmnnDeserializer.so.?? /opt/armnn/libarmnnTfLiteParser.so.?? /opt/armnn/libprotobuf.so.?.??.?.? /opt/ann/libann.s[o] /opt/ann/build.sh /opt/armnn +COPY ann/ann.py /usr/src/ann/ann.py COPY start.sh log_conf.json ./ COPY app . + ENTRYPOINT ["tini", "--"] CMD ["./start.sh"] diff --git a/machine-learning/ann/__init__.py b/machine-learning/ann/__init__.py new file mode 100644 index 0000000000..0793d1011b --- /dev/null +++ b/machine-learning/ann/__init__.py @@ -0,0 +1 @@ +from .ann import Ann, is_available diff --git a/machine-learning/ann/ann.cpp b/machine-learning/ann/ann.cpp new file mode 100644 index 0000000000..d0010f690b --- /dev/null +++ b/machine-learning/ann/ann.cpp @@ -0,0 +1,281 @@ +#include +#include +#include + +#include "armnn/IRuntime.hpp" +#include "armnn/INetwork.hpp" +#include "armnn/Types.hpp" +#include "armnnDeserializer/IDeserializer.hpp" +#include "armnnTfLiteParser/ITfLiteParser.hpp" +#include "armnnOnnxParser/IOnnxParser.hpp" + +using namespace armnn; + +struct IOInfos +{ + std::vector inputInfos; + std::vector outputInfos; +}; + +// from https://rigtorp.se/spinlock/ +struct SpinLock +{ + std::atomic lock_ = {false}; + + void lock() + { + for (;;) + { + if (!lock_.exchange(true, std::memory_order_acquire)) + { + break; + } + while (lock_.load(std::memory_order_relaxed)) + ; + } + } + + void unlock() { lock_.store(false, std::memory_order_release); } +}; + +class Ann +{ + +public: + int load(const char *modelPath, + bool fastMath, 
+ bool fp16, + bool saveCachedNetwork, + const char *cachedNetworkPath) + { + INetworkPtr network = loadModel(modelPath); + IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath); + const IOInfos infos = getIOInfos(optNet.get()); + NetworkId netId; + mutex.lock(); + Status status = runtime->LoadNetwork(netId, std::move(optNet)); + mutex.unlock(); + if (status != Status::Success) + { + return -1; + } + spinLock.lock(); + ioInfos[netId] = infos; + mutexes.emplace(netId, std::make_unique()); + spinLock.unlock(); + return netId; + } + + void execute(NetworkId netId, const void **inputData, void **outputData) + { + spinLock.lock(); + const IOInfos *infos = &ioInfos[netId]; + auto m = mutexes[netId].get(); + spinLock.unlock(); + InputTensors inputTensors; + inputTensors.reserve(infos->inputInfos.size()); + size_t i = 0; + for (const BindingPointInfo &info : infos->inputInfos) + inputTensors.emplace_back(info.first, ConstTensor(info.second, inputData[i++])); + OutputTensors outputTensors; + outputTensors.reserve(infos->outputInfos.size()); + i = 0; + for (const BindingPointInfo &info : infos->outputInfos) + outputTensors.emplace_back(info.first, Tensor(info.second, outputData[i++])); + m->lock(); + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + m->unlock(); + } + + void unload(NetworkId netId) + { + mutex.lock(); + runtime->UnloadNetwork(netId); + mutex.unlock(); + } + + int tensors(NetworkId netId, bool isInput = false) + { + spinLock.lock(); + const IOInfos *infos = &ioInfos[netId]; + spinLock.unlock(); + return (int)(isInput ? infos->inputInfos.size() : infos->outputInfos.size()); + } + + unsigned long shape(NetworkId netId, bool isInput = false, int index = 0) + { + spinLock.lock(); + const IOInfos *infos = &ioInfos[netId]; + spinLock.unlock(); + const TensorShape shape = (isInput ? 
infos->inputInfos : infos->outputInfos)[index].second.GetShape(); + unsigned long s = 0; + for (unsigned int d = 0; d < shape.GetNumDimensions(); d++) + s |= ((unsigned long)shape[d]) << (d * 16); // stores up to 4 16-bit values in a 64-bit value + return s; + } + + Ann(int tuningLevel, const char *tuningFile) + { + IRuntime::CreationOptions runtimeOptions; + BackendOptions backendOptions{"GpuAcc", + { + {"TuningLevel", tuningLevel}, + {"MemoryOptimizerStrategy", "ConstantMemoryStrategy"}, // SingleAxisPriorityList or ConstantMemoryStrategy + }}; + if (tuningFile) + backendOptions.AddOption({"TuningFile", tuningFile}); + runtimeOptions.m_BackendOptions.emplace_back(backendOptions); + runtime = IRuntime::CreateRaw(runtimeOptions); + }; + ~Ann() + { + IRuntime::Destroy(runtime); + }; + +private: + INetworkPtr loadModel(const char *modelPath) // picks the parser from the file extension: .tflite, .onnx, otherwise a serialized ArmNN network + { + const auto path = std::string(modelPath); + if (path.length() >= 7 && path.compare(path.length() - 7, 7, ".tflite") == 0) // endsWith(); the length guard avoids size_t underflow on short paths + { + auto parser = armnnTfLiteParser::ITfLiteParser::Create(); // owning pointer: the parser is destroyed on return instead of leaking + return parser->CreateNetworkFromBinaryFile(modelPath); + } + else if (path.length() >= 5 && path.compare(path.length() - 5, 5, ".onnx") == 0) // endsWith(); the length guard avoids size_t underflow on short paths + { + auto parser = armnnOnnxParser::IOnnxParser::Create(); // owning pointer: the parser is destroyed on return instead of leaking + return parser->CreateNetworkFromBinaryFile(modelPath); + } + else + { + std::ifstream ifs(path, std::ifstream::in | std::ifstream::binary); + auto parser = armnnDeserializer::IDeserializer::Create(); // owning pointer: the parser is destroyed on return instead of leaking + return parser->CreateNetworkFromBinary(ifs); + } + } + + static BindingPointInfo getInputTensorInfo(LayerBindingId inputBindingId, TensorInfo info) + { + const auto newInfo = TensorInfo{info.GetShape(), info.GetDataType(), + info.GetQuantizationScale(), + info.GetQuantizationOffset(), + true}; + return {inputBindingId, newInfo}; + } + + IOptimizedNetworkPtr OptimizeNetwork(INetwork *network, bool fastMath, bool fp16, bool saveCachedNetwork, const char *cachedNetworkPath) + { + const bool allowExpandedDims = false; + const ShapeInferenceMethod shapeInferenceMethod 
= ShapeInferenceMethod::ValidateOnly; + + OptimizerOptionsOpaque options; + options.SetReduceFp32ToFp16(fp16); + options.SetShapeInferenceMethod(shapeInferenceMethod); + options.SetAllowExpandedDims(allowExpandedDims); + + BackendOptions gpuAcc("GpuAcc", {{"FastMathEnabled", fastMath}}); + if (cachedNetworkPath) + { + gpuAcc.AddOption({"SaveCachedNetwork", saveCachedNetwork}); + gpuAcc.AddOption({"CachedNetworkFilePath", cachedNetworkPath}); + } + options.AddModelOption(gpuAcc); + + // No point in using ARMNN for CPU, use ONNX (quantized) instead. + // BackendOptions cpuAcc("CpuAcc", + // { + // {"FastMathEnabled", fastMath}, + // {"NumberOfThreads", 0}, + // }); + // options.AddModelOption(cpuAcc); + + BackendOptions allowExDimOpt("AllowExpandedDims", + {{"AllowExpandedDims", allowExpandedDims}}); + options.AddModelOption(allowExDimOpt); + BackendOptions shapeInferOpt("ShapeInferenceMethod", + {{"InferAndValidate", shapeInferenceMethod == ShapeInferenceMethod::InferAndValidate}}); + options.AddModelOption(shapeInferOpt); + + std::vector backends = { + BackendId("GpuAcc"), + // BackendId("CpuAcc"), + // BackendId("CpuRef"), + }; + return Optimize(*network, backends, runtime->GetDeviceSpec(), options); + } + + IOInfos getIOInfos(IOptimizedNetwork *optNet) + { + struct InfoStrategy : IStrategy + { + void ExecuteStrategy(const IConnectableLayer *layer, + const BaseDescriptor &descriptor, + const std::vector &constants, + const char *name, + const LayerBindingId id = 0) override + { + IgnoreUnused(descriptor, constants, id); + const LayerType lt = layer->GetType(); + if (lt == LayerType::Input) + ioInfos.inputInfos.push_back(getInputTensorInfo(id, layer->GetOutputSlot(0).GetTensorInfo())); + else if (lt == LayerType::Output) + ioInfos.outputInfos.push_back({id, layer->GetInputSlot(0).GetTensorInfo()}); + } + IOInfos ioInfos; + }; + + InfoStrategy infoStrategy; + optNet->ExecuteStrategy(infoStrategy); + return infoStrategy.ioInfos; + } + + IRuntime *runtime; + std::map 
ioInfos; + std::map> mutexes; // mutex per network to not execute the same network concurrently + std::mutex mutex; // global mutex for load/unload calls to the runtime + SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps +}; + +extern "C" void *init(int logLevel, int tuningLevel, const char *tuningFile) +{ + LogSeverity level = static_cast(logLevel); + ConfigureLogging(true, true, level); + + Ann *ann = new Ann(tuningLevel, tuningFile); + return ann; +} + +extern "C" void destroy(void *ann) +{ + delete ((Ann *)ann); +} + +extern "C" int load(void *ann, + const char *path, + bool fastMath, + bool fp16, + bool saveCachedNetwork, + const char *cachedNetworkPath) +{ + return ((Ann *)ann)->load(path, fastMath, fp16, saveCachedNetwork, cachedNetworkPath); +} + +extern "C" void unload(void *ann, NetworkId netId) +{ + ((Ann *)ann)->unload(netId); +} + +extern "C" void execute(void *ann, NetworkId netId, const void **inputData, void **outputData) +{ + ((Ann *)ann)->execute(netId, inputData, outputData); +} + +extern "C" unsigned long shape(void *ann, NetworkId netId, bool isInput, int index) +{ + return ((Ann *)ann)->shape(netId, isInput, index); +} + +extern "C" int tensors(void *ann, NetworkId netId, bool isInput) +{ + return ((Ann *)ann)->tensors(netId, isInput); +} \ No newline at end of file diff --git a/machine-learning/ann/ann.py b/machine-learning/ann/ann.py new file mode 100644 index 0000000000..8f8c5473c7 --- /dev/null +++ b/machine-learning/ann/ann.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +from ctypes import CDLL, Array, c_bool, c_char_p, c_int, c_ulong, c_void_p +from os.path import exists +from typing import Any, Generic, Protocol, Type, TypeVar + +import numpy as np +from numpy.typing import NDArray + +from app.config import log + +try: + CDLL("libmali.so") # fail if libmali.so is not mounted into container + libann = CDLL("libann.so") + libann.init.argtypes = c_int, c_int, c_char_p + 
libann.init.restype = c_void_p + libann.load.argtypes = c_void_p, c_char_p, c_bool, c_bool, c_bool, c_char_p + libann.load.restype = c_int + libann.execute.argtypes = c_void_p, c_int, Array[c_void_p], Array[c_void_p] + libann.unload.argtypes = c_void_p, c_int + libann.destroy.argtypes = (c_void_p,) + libann.shape.argtypes = c_void_p, c_int, c_bool, c_int + libann.shape.restype = c_ulong + libann.tensors.argtypes = c_void_p, c_int, c_bool + libann.tensors.restype = c_int + is_available = True +except OSError as e: + log.debug("Could not load ANN shared libraries, using ONNX: %s", e) + is_available = False + +T = TypeVar("T", covariant=True) + + +class Newable(Protocol[T]): + def new(self) -> None: + ... + + +class _Singleton(type, Newable[T]): + _instances: dict[_Singleton[T], Newable[T]] = {} + + def __call__(cls, *args: Any, **kwargs: Any) -> Newable[T]: + if cls not in cls._instances: + obj: Newable[T] = super(_Singleton, cls).__call__(*args, **kwargs) + cls._instances[cls] = obj + else: + obj = cls._instances[cls] + obj.new() + return obj + + +class Ann(metaclass=_Singleton): + def __init__(self, log_level: int = 3, tuning_level: int = 1, tuning_file: str | None = None) -> None: + if not is_available: + raise RuntimeError("libann is not available!") + if tuning_file and not exists(tuning_file): + raise ValueError("tuning_file must point to an existing (possibly empty) file!") + if tuning_level == 0 and tuning_file is None: + raise ValueError("tuning_level == 0 reads existing tuning information and requires a tuning_file") + if tuning_level < 0 or tuning_level > 3: + raise ValueError("tuning_level must be 0 (load from tuning_file), 1, 2 or 3.") + if log_level < 0 or log_level > 5: + raise ValueError("log_level must be 0 (trace), 1 (debug), 2 (info), 3 (warning), 4 (error) or 5 (fatal)") + self.log_level = log_level + self.tuning_level = tuning_level + self.tuning_file = tuning_file + self.output_shapes: dict[int, tuple[tuple[int], ...]] = {} + self.input_shapes: 
dict[int, tuple[tuple[int], ...]] = {} + self.ann: int | None = None + self.new() + + def new(self) -> None: + if self.ann is None: + self.ann = libann.init( + self.log_level, + self.tuning_level, + self.tuning_file.encode() if self.tuning_file is not None else None, + ) + self.ref_count = 0 + + self.ref_count += 1 + + def destroy(self) -> None: + self.ref_count -= 1 + if self.ref_count <= 0 and self.ann is not None: + libann.destroy(self.ann) + self.ann = None + + def __del__(self) -> None: + if self.ann is not None: + libann.destroy(self.ann) + self.ann = None + + def load( + self, + model_path: str, + fast_math: bool = True, + fp16: bool = False, + save_cached_network: bool = False, + cached_network_path: str | None = None, + ) -> int: + if not model_path.endswith((".armnn", ".tflite", ".onnx")): + raise ValueError("model_path must be a file with extension .armnn, .tflite or .onnx") + if not exists(model_path): + raise ValueError("model_path must point to an existing file!") + if cached_network_path is not None and not exists(cached_network_path): + raise ValueError("cached_network_path must point to an existing (possibly empty) file!") + if save_cached_network and cached_network_path is None: + raise ValueError("save_cached_network is True, cached_network_path must be specified!") + net_id: int = libann.load( + self.ann, + model_path.encode(), + fast_math, + fp16, + save_cached_network, + cached_network_path.encode() if cached_network_path is not None else None, + ) + + self.input_shapes[net_id] = tuple( + self.shape(net_id, input=True, index=i) for i in range(self.tensors(net_id, input=True)) + ) + self.output_shapes[net_id] = tuple( + self.shape(net_id, input=False, index=i) for i in range(self.tensors(net_id, input=False)) + ) + return net_id + + def unload(self, network_id: int) -> None: + libann.unload(self.ann, network_id) + del self.output_shapes[network_id] + + def execute(self, network_id: int, input_tensors: list[NDArray[np.float32]]) -> 
list[NDArray[np.float32]]: + if not isinstance(input_tensors, list): + raise ValueError("input_tensors needs to be a list!") + net_input_shapes = self.input_shapes[network_id] + if len(input_tensors) != len(net_input_shapes): + raise ValueError(f"input_tensors lengths {len(input_tensors)} != network inputs {len(net_input_shapes)}") + for net_input_shape, input_tensor in zip(net_input_shapes, input_tensors): + if net_input_shape != input_tensor.shape: + raise ValueError(f"input_tensor shape {input_tensor.shape} != network input shape {net_input_shape}") + if not input_tensor.flags.c_contiguous: + raise ValueError("input_tensors must be c_contiguous numpy ndarrays") + output_tensors: list[NDArray[np.float32]] = [ + np.ndarray(s, dtype=np.float32) for s in self.output_shapes[network_id] + ] + input_type = c_void_p * len(input_tensors) + inputs = input_type(*[t.ctypes.data_as(c_void_p) for t in input_tensors]) + output_type = c_void_p * len(output_tensors) + outputs = output_type(*[t.ctypes.data_as(c_void_p) for t in output_tensors]) + libann.execute(self.ann, network_id, inputs, outputs) + return output_tensors + + def shape(self, network_id: int, input: bool = False, index: int = 0) -> tuple[int]: + s = libann.shape(self.ann, network_id, input, index) + a = [] + while s != 0: + a.append(s & 0xFFFF) + s >>= 16 + return tuple(a) + + def tensors(self, network_id: int, input: bool = False) -> int: + tensors: int = libann.tensors(self.ann, network_id, input) + return tensors diff --git a/machine-learning/ann/build.sh b/machine-learning/ann/build.sh new file mode 100644 index 0000000000..d90fa1ae1b --- /dev/null +++ b/machine-learning/ann/build.sh @@ -0,0 +1 @@ +g++ -shared -O3 -o libann.so -fuse-ld=gold -std=c++17 -I$ARMNN_PATH/include -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -L$ARMNN_PATH ann.cpp diff --git a/machine-learning/ann/export/.gitignore b/machine-learning/ann/export/.gitignore new file mode 100644 index 0000000000..eeebcd1c35 --- 
/dev/null +++ b/machine-learning/ann/export/.gitignore @@ -0,0 +1,2 @@ +armnn* +output/ diff --git a/machine-learning/ann/export/build-converter.sh b/machine-learning/ann/export/build-converter.sh new file mode 100755 index 0000000000..0deb2e7ed5 --- /dev/null +++ b/machine-learning/ann/export/build-converter.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd armnn-23.11/ +g++ -o ../armnnconverter -O1 -DARMNN_ONNX_PARSER -DARMNN_SERIALIZER -DARMNN_TF_LITE_PARSER -fuse-ld=gold -std=c++17 -Iinclude -Isrc/armnnUtils -Ithird-party -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -larmnnSerializer -L../armnn src/armnnConverter/ArmnnConverter.cpp diff --git a/machine-learning/ann/export/download-armnn.sh b/machine-learning/ann/export/download-armnn.sh new file mode 100755 index 0000000000..e138e34f57 --- /dev/null +++ b/machine-learning/ann/export/download-armnn.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +# binaries +mkdir armnn +curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-x86_64.tar.gz" | tar -zx -C armnn + +# source to build ArmnnConverter +curl -SL "https://github.com/ARM-software/armnn/archive/refs/tags/v23.11.tar.gz" | tar -zx diff --git a/machine-learning/ann/export/env.yaml b/machine-learning/ann/export/env.yaml new file mode 100644 index 0000000000..c5e656cd46 --- /dev/null +++ b/machine-learning/ann/export/env.yaml @@ -0,0 +1,201 @@ +name: annexport +channels: + - pytorch + - nvidia + - conda-forge +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_kmp_llvm + - aiohttp=3.9.1=py310h2372a71_0 + - aiosignal=1.3.1=pyhd8ed1ab_0 + - arpack=3.8.0=nompi_h0baa96a_101 + - async-timeout=4.0.3=pyhd8ed1ab_0 + - attrs=23.1.0=pyh71513ae_1 + - aws-c-auth=0.7.3=h28f7589_1 + - aws-c-cal=0.6.1=hc309b26_1 + - aws-c-common=0.9.0=hd590300_0 + - aws-c-compression=0.2.17=h4d4d85c_2 + - aws-c-event-stream=0.3.1=h2e3709c_4 + - aws-c-http=0.7.11=h00aa349_4 + - aws-c-io=0.13.32=he9a53bd_1 + - aws-c-mqtt=0.9.3=hb447be9_1 + - 
aws-c-s3=0.3.14=hf3aad02_1 + - aws-c-sdkutils=0.1.12=h4d4d85c_1 + - aws-checksums=0.1.17=h4d4d85c_1 + - aws-crt-cpp=0.21.0=hb942446_5 + - aws-sdk-cpp=1.10.57=h85b1a90_19 + - blas=2.120=openblas + - blas-devel=3.9.0=20_linux64_openblas + - brotli-python=1.0.9=py310hd8f1fbe_9 + - bzip2=1.0.8=hd590300_5 + - c-ares=1.23.0=hd590300_0 + - ca-certificates=2023.11.17=hbcca054_0 + - certifi=2023.11.17=pyhd8ed1ab_0 + - charset-normalizer=3.3.2=pyhd8ed1ab_0 + - click=8.1.7=unix_pyh707e725_0 + - colorama=0.4.6=pyhd8ed1ab_0 + - coloredlogs=15.0.1=pyhd8ed1ab_3 + - cuda-cudart=11.7.99=0 + - cuda-cupti=11.7.101=0 + - cuda-libraries=11.7.1=0 + - cuda-nvrtc=11.7.99=0 + - cuda-nvtx=11.7.91=0 + - cuda-runtime=11.7.1=0 + - dataclasses=0.8=pyhc8e2a94_3 + - datasets=2.14.7=pyhd8ed1ab_0 + - dill=0.3.7=pyhd8ed1ab_0 + - filelock=3.13.1=pyhd8ed1ab_0 + - flatbuffers=23.5.26=h59595ed_1 + - freetype=2.12.1=h267a509_2 + - frozenlist=1.4.0=py310h2372a71_1 + - fsspec=2023.10.0=pyhca7485f_0 + - ftfy=6.1.3=pyhd8ed1ab_0 + - gflags=2.2.2=he1b5a44_1004 + - glog=0.6.0=h6f12383_0 + - glpk=5.0=h445213a_0 + - gmp=6.3.0=h59595ed_0 + - gmpy2=2.1.2=py310h3ec546c_1 + - huggingface_hub=0.17.3=pyhd8ed1ab_0 + - humanfriendly=10.0=pyhd8ed1ab_6 + - icu=73.2=h59595ed_0 + - idna=3.6=pyhd8ed1ab_0 + - importlib-metadata=7.0.0=pyha770c72_0 + - importlib_metadata=7.0.0=hd8ed1ab_0 + - joblib=1.3.2=pyhd8ed1ab_0 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.21.2=h659d440_0 + - lcms2=2.15=h7f713cb_2 + - ld_impl_linux-64=2.40=h41732ed_0 + - lerc=4.0.0=h27087fc_0 + - libabseil=20230125.3=cxx17_h59595ed_0 + - libarrow=12.0.1=hb87d912_8_cpu + - libblas=3.9.0=20_linux64_openblas + - libbrotlicommon=1.0.9=h166bdaf_9 + - libbrotlidec=1.0.9=h166bdaf_9 + - libbrotlienc=1.0.9=h166bdaf_9 + - libcblas=3.9.0=20_linux64_openblas + - libcrc32c=1.1.2=h9c3ff4c_0 + - libcublas=11.10.3.66=0 + - libcufft=10.7.2.124=h4fbf590_0 + - libcufile=1.8.1.2=0 + - libcurand=10.3.4.101=0 + - libcurl=8.5.0=hca28451_0 + - libcusolver=11.4.0.1=0 + - 
libcusparse=11.7.4.91=0 + - libdeflate=1.19=hd590300_0 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=hd590300_2 + - libevent=2.1.12=hf998b51_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=13.2.0=h807b86a_3 + - libgfortran-ng=13.2.0=h69a702a_3 + - libgfortran5=13.2.0=ha4646dd_3 + - libgoogle-cloud=2.12.0=hac9eb74_1 + - libgrpc=1.54.3=hb20ce57_0 + - libhwloc=2.9.3=default_h554bfaf_1009 + - libiconv=1.17=hd590300_1 + - libjpeg-turbo=2.1.5.1=hd590300_1 + - liblapack=3.9.0=20_linux64_openblas + - liblapacke=3.9.0=20_linux64_openblas + - libnghttp2=1.58.0=h47da74e_1 + - libnpp=11.7.4.75=0 + - libnsl=2.0.1=hd590300_0 + - libnuma=2.0.16=h0b41bf4_1 + - libnvjpeg=11.8.0.2=0 + - libopenblas=0.3.25=pthreads_h413a1c8_0 + - libpng=1.6.39=h753d276_0 + - libprotobuf=3.21.12=hfc55251_2 + - libsentencepiece=0.1.99=h180e1df_0 + - libsqlite=3.44.2=h2797004_0 + - libssh2=1.11.0=h0841786_0 + - libstdcxx-ng=13.2.0=h7e041cc_3 + - libthrift=0.18.1=h8fd135c_2 + - libtiff=4.6.0=h29866fb_1 + - libutf8proc=2.8.0=h166bdaf_0 + - libuuid=2.38.1=h0b41bf4_0 + - libwebp-base=1.3.2=hd590300_0 + - libxcb=1.15=h0b41bf4_0 + - libxml2=2.11.6=h232c23b_0 + - libzlib=1.2.13=hd590300_5 + - llvm-openmp=17.0.6=h4dfa4b3_0 + - lz4-c=1.9.4=hcb278e6_0 + - mkl=2022.2.1=h84fe81f_16997 + - mkl-devel=2022.2.1=ha770c72_16998 + - mkl-include=2022.2.1=h84fe81f_16997 + - mpc=1.3.1=hfe3b2da_0 + - mpfr=4.2.1=h9458935_0 + - mpmath=1.3.0=pyhd8ed1ab_0 + - multidict=6.0.4=py310h2372a71_1 + - multiprocess=0.70.15=py310h2372a71_1 + - ncurses=6.4=h59595ed_2 + - numpy=1.26.2=py310hb13e2d6_0 + - onnx=1.14.0=py310ha3deec4_1 + - onnx2torch=1.5.13=pyhd8ed1ab_0 + - onnxruntime=1.16.3=py310hd4b7fbc_1_cpu + - open-clip-torch=2.23.0=pyhd8ed1ab_1 + - openblas=0.3.25=pthreads_h7a3da1a_0 + - openjpeg=2.5.0=h488ebb8_3 + - openssl=3.2.0=hd590300_1 + - orc=1.9.0=h2f23424_1 + - packaging=23.2=pyhd8ed1ab_0 + - pandas=2.1.4=py310hcc13569_0 + - pillow=10.0.1=py310h29da1c1_1 + - pip=23.3.1=pyhd8ed1ab_0 + - protobuf=4.21.12=py310heca2aa9_0 + - 
pthread-stubs=0.4=h36c2ea0_1001 + - pyarrow=12.0.1=py310h0576679_8_cpu + - pyarrow-hotfix=0.6=pyhd8ed1ab_0 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.10.13=hd12c33a_0_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-flatbuffers=23.5.26=pyhd8ed1ab_0 + - python-tzdata=2023.3=pyhd8ed1ab_0 + - python-xxhash=3.4.1=py310h2372a71_0 + - python_abi=3.10=4_cp310 + - pytorch=1.13.1=cpu_py310hd11e9c7_1 + - pytorch-cuda=11.7=h778d358_5 + - pytorch-mutex=1.0=cuda + - pytz=2023.3.post1=pyhd8ed1ab_0 + - pyyaml=6.0.1=py310h2372a71_1 + - rdma-core=28.9=h59595ed_1 + - re2=2023.03.02=h8c504da_0 + - readline=8.2=h8228510_1 + - regex=2023.10.3=py310h2372a71_0 + - requests=2.31.0=pyhd8ed1ab_0 + - s2n=1.3.49=h06160fa_0 + - sacremoses=0.0.53=pyhd8ed1ab_0 + - safetensors=0.3.3=py310hcb5633a_1 + - sentencepiece=0.1.99=hff52083_0 + - sentencepiece-python=0.1.99=py310hebdb9f0_0 + - sentencepiece-spm=0.1.99=h180e1df_0 + - setuptools=68.2.2=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - sleef=3.5.1=h9b69904_2 + - snappy=1.1.10=h9fff704_0 + - sympy=1.12=pypyh9d50eac_103 + - tbb=2021.11.0=h00ab1b0_0 + - texttable=1.7.0=pyhd8ed1ab_0 + - timm=0.9.12=pyhd8ed1ab_0 + - tk=8.6.13=noxft_h4845f30_101 + - tokenizers=0.14.1=py310h320607d_2 + - torchvision=0.14.1=cpu_py310hd3d2ac3_1 + - tqdm=4.66.1=pyhd8ed1ab_0 + - transformers=4.35.2=pyhd8ed1ab_0 + - typing-extensions=4.9.0=hd8ed1ab_0 + - typing_extensions=4.9.0=pyha770c72_0 + - tzdata=2023c=h71feb2d_0 + - ucx=1.14.1=h64cca9d_5 + - urllib3=2.1.0=pyhd8ed1ab_0 + - wcwidth=0.2.12=pyhd8ed1ab_0 + - wheel=0.42.0=pyhd8ed1ab_0 + - xorg-libxau=1.0.11=hd590300_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xxhash=0.8.2=hd590300_0 + - xz=5.2.6=h166bdaf_0 + - yaml=0.2.5=h7f98852_2 + - yarl=1.9.3=py310h2372a71_0 + - zipp=3.17.0=pyhd8ed1ab_0 + - zlib=1.2.13=hd590300_5 + - zstd=1.5.5=hfc55251_0 + - pip: + - git+https://github.com/fyfrey/TinyNeuralNetwork.git diff --git a/machine-learning/ann/export/run.py b/machine-learning/ann/export/run.py new file mode 100644 index 
0000000000..91c659a02c --- /dev/null +++ b/machine-learning/ann/export/run.py @@ -0,0 +1,157 @@ +import logging +import os +import platform +import subprocess +from abc import abstractmethod + +import onnx +import open_clip +import torch +from onnx2torch import convert +from onnxruntime.tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed +from tinynn.converter import TFLiteConverter + + +class ExportBase(torch.nn.Module): + input_shape: tuple[int, ...] + + def __init__(self, device: torch.device, name: str): + super().__init__() + self.device = device + self.name = name + self.optimize = 5 + self.nchw_transpose = False + + @abstractmethod + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor]: + pass + + def dummy_input(self) -> torch.FloatTensor: + return torch.rand((1, 3, 224, 224), device=self.device) + + +class ArcFace(ExportBase): + input_shape = (1, 3, 112, 112) + + def __init__(self, onnx_model_path: str, device: torch.device): + name, _ = os.path.splitext(os.path.basename(onnx_model_path)) + super().__init__(device, name) + onnx_model = onnx.load_model(onnx_model_path) + make_input_shape_fixed(onnx_model.graph, onnx_model.graph.input[0].name, self.input_shape) + fix_output_shapes(onnx_model) + self.model = convert(onnx_model).to(device) + if self.device.type == "cuda": + self.model = self.model.half() + + def forward(self, input_tensor: torch.Tensor) -> torch.FloatTensor: + embedding: torch.FloatTensor = self.model( + input_tensor.half() if self.device.type == "cuda" else input_tensor + ).float() + assert isinstance(embedding, torch.FloatTensor) + return embedding + + def dummy_input(self) -> torch.FloatTensor: + return torch.rand(self.input_shape, device=self.device) + + +class RetinaFace(ExportBase): + input_shape = (1, 3, 640, 640) + + def __init__(self, onnx_model_path: str, device: torch.device): + name, _ = os.path.splitext(os.path.basename(onnx_model_path)) + super().__init__(device, name) + 
self.optimize = 3 + self.model = convert(onnx_model_path).eval().to(device) + if self.device.type == "cuda": + self.model = self.model.half() + + def forward(self, input_tensor: torch.Tensor) -> tuple[torch.FloatTensor]: + out: torch.Tensor = self.model(input_tensor.half() if self.device.type == "cuda" else input_tensor) + return tuple(o.float() for o in out) + + def dummy_input(self) -> torch.FloatTensor: + return torch.rand(self.input_shape, device=self.device) + + +class ClipVision(ExportBase): + input_shape = (1, 3, 224, 224) + + def __init__(self, model_name: str, weights: str, device: torch.device): + super().__init__(device, model_name + "__" + weights) + self.model = open_clip.create_model( + model_name, + weights, + precision="fp16" if device.type == "cuda" else "fp32", + jit=False, + require_pretrained=True, + device=device, + ) + + def forward(self, input_tensor: torch.Tensor) -> torch.FloatTensor: + embedding: torch.Tensor = self.model.encode_image( + input_tensor.half() if self.device.type == "cuda" else input_tensor, + normalize=True, + ).float() + return embedding + + +def export(model: ExportBase) -> None: + model.eval() + for param in model.parameters(): + param.requires_grad = False + dummy_input = model.dummy_input() + model(dummy_input) + jit = torch.jit.trace(model, dummy_input) # type: ignore[no-untyped-call,attr-defined] + tflite_model_path = f"output/{model.name}.tflite" + os.makedirs("output", exist_ok=True) + + converter = TFLiteConverter( + jit, + dummy_input, + tflite_model_path, + optimize=model.optimize, + nchw_transpose=model.nchw_transpose, + ) + # segfaults on ARM, must run on x86_64 / AMD64 + converter.convert() + + armnn_model_path = f"output/{model.name}.armnn" + os.environ["LD_LIBRARY_PATH"] = "armnn" + subprocess.run( + [ + "./armnnconverter", + "-f", + "tflite-binary", + "-m", + tflite_model_path, + "-i", + "input_tensor", + "-o", + "output_tensor", + "-p", + armnn_model_path, + ] + ) + + +def main() -> None: + if 
platform.machine() not in ("x86_64", "AMD64"): + raise RuntimeError(f"Can only run on x86_64 / AMD64, not {platform.machine()}") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device.type != "cuda": + logging.warning( + "No CUDA available, cannot create fp16 model! proceeding to create a fp32 model (use only for testing)" + ) + models = [ + ClipVision("ViT-B-32", "openai", device), + ArcFace("buffalo_l_rec.onnx", device), + RetinaFace("buffalo_l_det.onnx", device), + ] + for model in models: + export(model) + + +if __name__ == "__main__": + with torch.no_grad(): + main() diff --git a/machine-learning/app/config.py b/machine-learning/app/config.py index a0bc01d9a6..a34f47f2f1 100644 --- a/machine-learning/app/config.py +++ b/machine-learning/app/config.py @@ -26,6 +26,7 @@ class Settings(BaseSettings): request_threads: int = os.cpu_count() or 4 model_inter_op_threads: int = 1 model_intra_op_threads: int = 2 + ann: bool = True class Config: env_prefix = "MACHINE_LEARNING_" diff --git a/machine-learning/app/models/ann.py b/machine-learning/app/models/ann.py new file mode 100644 index 0000000000..55781729fa --- /dev/null +++ b/machine-learning/app/models/ann.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, NamedTuple + +from numpy import ascontiguousarray + +from ann.ann import Ann +from app.schemas import ndarray_f32, ndarray_i32 + +from ..config import log, settings + + +class AnnSession: + """ + Wrapper for ANN to be drop-in replacement for ONNX session. 
+ """ + + def __init__(self, model_path: Path): + tuning_file = Path(settings.cache_folder) / "gpu-tuning.ann" + with tuning_file.open(mode="a"): + # make sure tuning file exists (without clearing contents) + # once filled, the tuning file reduces the cost/time of the first + # inference after model load by 10s of seconds + pass + self.ann = Ann(tuning_level=3, tuning_file=tuning_file.as_posix()) + log.info("Loading ANN model %s ...", model_path) + cache_file = model_path.with_suffix(".anncache") + save = False + if not cache_file.is_file(): + save = True + with cache_file.open(mode="a"): + # create empty model cache file + pass + + self.model = self.ann.load( + model_path.as_posix(), + save_cached_network=save, + cached_network_path=cache_file.as_posix(), + ) + log.info("Loaded ANN model with ID %d", self.model) + + def __del__(self) -> None: + self.ann.unload(self.model) + log.info("Unloaded ANN model %d", self.model) + self.ann.destroy() + + def get_inputs(self) -> list[AnnNode]: + shapes = self.ann.input_shapes[self.model] + return [AnnNode(None, s) for s in shapes] + + def get_outputs(self) -> list[AnnNode]: + shapes = self.ann.output_shapes[self.model] + return [AnnNode(None, s) for s in shapes] + + def run( + self, + output_names: list[str] | None, + input_feed: dict[str, ndarray_f32] | dict[str, ndarray_i32], + run_options: Any = None, + ) -> list[ndarray_f32]: + inputs: list[ndarray_f32] = [ascontiguousarray(v) for v in input_feed.values()] + return self.ann.execute(self.model, inputs) + + +class AnnNode(NamedTuple): + name: str | None + shape: tuple[int, ...] 
diff --git a/machine-learning/app/models/base.py b/machine-learning/app/models/base.py
index ecdd4ba2a9..4f819abd00 100644
--- a/machine-learning/app/models/base.py
+++ b/machine-learning/app/models/base.py
@@ -10,8 +10,11 @@ import onnxruntime as ort
 from huggingface_hub import snapshot_download
 from typing_extensions import Buffer
 
+import ann.ann
+
 from ..config import get_cache_dir, get_hf_model_name, log, settings
 from ..schemas import ModelType
+from .ann import AnnSession
 
 
 class InferenceModel(ABC):
@@ -138,6 +141,29 @@ class InferenceModel(ABC):
             self.cache_dir.unlink()
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
+    def _make_session(self, model_path: Path) -> AnnSession | ort.InferenceSession:
+        """
+        Create the inference session for `model_path`.
+
+        The `.armnn` sibling of `model_path` is loaded as an `AnnSession` when ANN is
+        enabled in the settings, reported as available and that file exists. Otherwise
+        `model_path` itself is loaded with ONNX Runtime.
+
+        Raises ValueError if the ANN path is not taken and `model_path` is not a file.
+        """
+        armnn_path = model_path.with_suffix(".armnn")
+        use_ann = settings.ann and ann.ann.is_available and armnn_path.is_file()
+        if use_ann:
+            return AnnSession(armnn_path)
+        if not model_path.is_file():
+            raise ValueError(f"the file model_path='{model_path}' does not exist")
+        return ort.InferenceSession(
+            model_path.as_posix(),
+            sess_options=self.sess_options,
+            providers=self.providers,
+            provider_options=self.provider_options,
+        )
+
 
 # HF deep copies configs, so we need to make session options picklable
 class PicklableSessionOptions(ort.SessionOptions):  # type: ignore[misc]
diff --git a/machine-learning/app/models/clip.py b/machine-learning/app/models/clip.py
index c02458f8d2..61ddf9040a 100644
--- a/machine-learning/app/models/clip.py
+++ b/machine-learning/app/models/clip.py
@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Any, Literal
 
 import numpy as np
-import onnxruntime as ort
 from PIL import Image
 from tokenizers import Encoding, Tokenizer
 
@@ -33,24 +32,12 @@ class BaseCLIPEncoder(InferenceModel):
     def _load(self) -> None:
         if self.mode == "text" or self.mode is None:
             log.debug(f"Loading clip text model '{self.model_name}'")
-
-            self.text_model = ort.InferenceSession(
-                self.textual_path.as_posix(),
-                sess_options=self.sess_options,
-                providers=self.providers,
-                provider_options=self.provider_options,
-            )
+            self.text_model = self._make_session(self.textual_path)
             log.debug(f"Loaded clip text model '{self.model_name}'")
 
         if self.mode == "vision" or self.mode is None:
             log.debug(f"Loading clip vision model '{self.model_name}'")
-
-            self.vision_model = ort.InferenceSession(
-                self.visual_path.as_posix(),
-                sess_options=self.sess_options,
-                providers=self.providers,
-                provider_options=self.provider_options,
-            )
+            self.vision_model = self._make_session(self.visual_path)
             log.debug(f"Loaded clip vision model '{self.model_name}'")
 
     def _predict(self, image_or_text: Image.Image | str) -> ndarray_f32:
@@ -61,12 +48,10 @@ class BaseCLIPEncoder(InferenceModel):
             case Image.Image():
                 if self.mode == "text":
                     raise TypeError("Cannot encode image as text-only model")
-
                 outputs: ndarray_f32 = self.vision_model.run(None, self.transform(image_or_text))[0][0]
             case str():
                 if self.mode == "vision":
                     raise TypeError("Cannot encode text as vision-only model")
-
                 outputs = self.text_model.run(None, self.tokenize(image_or_text))[0][0]
             case _:
                 raise TypeError(f"Expected Image or str, but got: {type(image_or_text)}")
diff --git a/machine-learning/app/models/facial_recognition.py b/machine-learning/app/models/facial_recognition.py
index 24719eb83a..e64de9f65c 100644
--- a/machine-learning/app/models/facial_recognition.py
+++ b/machine-learning/app/models/facial_recognition.py
@@ -3,7 +3,6 @@ from typing import Any
 
 import cv2
 import numpy as np
-import onnxruntime as ort
 from insightface.model_zoo import ArcFaceONNX, RetinaFace
 from insightface.utils.face_align import norm_crop
 
@@ -27,23 +26,8 @@ class FaceRecognizer(InferenceModel):
         super().__init__(clean_name(model_name), cache_dir, **model_kwargs)
 
     def _load(self) -> None:
-        self.det_model = RetinaFace(
-            session=ort.InferenceSession(
-                self.det_file.as_posix(),
-                sess_options=self.sess_options,
-                providers=self.providers,
-                provider_options=self.provider_options,
-            ),
-        )
-        self.rec_model = ArcFaceONNX(
-            self.rec_file.as_posix(),
-            session=ort.InferenceSession(
-                self.rec_file.as_posix(),
-                sess_options=self.sess_options,
-                providers=self.providers,
-                provider_options=self.provider_options,
-            ),
-        )
+        self.det_model = RetinaFace(session=self._make_session(self.det_file))
+        self.rec_model = ArcFaceONNX(self.rec_file.as_posix(), session=self._make_session(self.rec_file))
 
         self.det_model.prepare(
             ctx_id=0,
diff --git a/machine-learning/app/test_main.py b/machine-learning/app/test_main.py
index 3ab0e4c0de..2f0804f263 100644
--- a/machine-learning/app/test_main.py
+++ b/machine-learning/app/test_main.py
@@ -13,7 +13,7 @@ from PIL import Image
 from pytest_mock import MockerFixture
 
 from .config import settings
-from .models.base import PicklableSessionOptions
+from .models.base import InferenceModel, PicklableSessionOptions
 from .models.cache import ModelCache
 from .models.clip import OpenCLIPEncoder
 from .models.facial_recognition import FaceRecognizer
@@ -36,9 +36,10 @@ class TestCLIP:
         mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
         mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
         mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
+
+        mocked = mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value
+        mocked.run.return_value = [[self.embedding]]
         mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True)
-        mocked = mocker.patch("app.models.clip.ort.InferenceSession", autospec=True)
-        mocked.return_value.run.return_value = [[self.embedding]]
 
         clip_encoder = OpenCLIPEncoder("ViT-B-32::openai", cache_dir="test_cache", mode="vision")
         embedding = clip_encoder.predict(pil_image)
@@ -47,7 +48,7 @@ class TestCLIP:
         assert isinstance(embedding, np.ndarray)
         assert embedding.shape[0] == clip_model_cfg["embed_dim"]
         assert embedding.dtype == np.float32
-        clip_encoder.vision_model.run.assert_called_once()
+        mocked.run.assert_called_once()
 
     def test_basic_text(
         self,
@@ -60,9 +61,10 @@ class TestCLIP:
         mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
         mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
         mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
+
+        mocked = mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value
+        mocked.run.return_value = [[self.embedding]]
         mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True)
-        mocked = mocker.patch("app.models.clip.ort.InferenceSession", autospec=True)
-        mocked.return_value.run.return_value = [[self.embedding]]
 
         clip_encoder = OpenCLIPEncoder("ViT-B-32::openai", cache_dir="test_cache", mode="text")
         embedding = clip_encoder.predict("test search query")
@@ -71,7 +73,7 @@ class TestCLIP:
         assert isinstance(embedding, np.ndarray)
         assert embedding.shape[0] == clip_model_cfg["embed_dim"]
         assert embedding.dtype == np.float32
-        clip_encoder.text_model.run.assert_called_once()
+        mocked.run.assert_called_once()
 
 
 class TestFaceRecognition: