feat(ml): improved ARM-NN support (#11233)

2025-03-06 16:16:48 +02:00 · 2024-07-20 21:59:27 +02:00 · 2024-07-20 21:59:27 +02:00 · 54488b1016
commit 54488b1016
parent 7c3326b662
8 changed files with 70 additions and 32 deletions
--- a/docs/docs/features/ml-hardware-acceleration.md
+++ b/docs/docs/features/ml-hardware-acceleration.md
@ -32,6 +32,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele
  - Where and how you can get this file depends on device and vendor, but typically, the device vendor also supplies these
  - The `hwaccel.ml.yml` file assumes the path to it is `/usr/lib/libmali.so`, so update accordingly if it is elsewhere
  - The `hwaccel.ml.yml` file assumes an additional file `/lib/firmware/mali_csffw.bin`, so update accordingly if your device's driver does not require this file
+- Optional: Configure your `.env` file; see [environment variables](/docs/install/environment-variables) for ARM NN-specific settings

 #### CUDA

--- a/docs/docs/install/environment-variables.md
+++ b/docs/docs/install/environment-variables.md
@ -156,18 +156,21 @@ Redis (Sentinel) URL example JSON before encoding:

 ## Machine Learning

-| Variable                                         | Description                                                          |                Default                | Containers       |
-| :----------------------------------------------- | :------------------------------------------------------------------- | :-----------------------------------: | :--------------- |
-| `MACHINE_LEARNING_MODEL_TTL`                     | Inactivity time (s) before a model is unloaded (disabled if \<= 0)   |                 `300`                 | machine learning |
-| `MACHINE_LEARNING_MODEL_TTL_POLL_S`              | Interval (s) between checks for the model TTL (disabled if \<= 0)    |                 `10`                  | machine learning |
-| `MACHINE_LEARNING_CACHE_FOLDER`                  | Directory where models are downloaded                                |               `/cache`                | machine learning |
-| `MACHINE_LEARNING_REQUEST_THREADS`<sup>\*1</sup> | Thread count of the request thread pool (disabled if \<= 0)          |          number of CPU cores          | machine learning |
-| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS`        | Number of parallel model operations                                  |                  `1`                  | machine learning |
-| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS`        | Number of threads for each model operation                           |                  `2`                  | machine learning |
-| `MACHINE_LEARNING_WORKERS`<sup>\*2</sup>         | Number of worker processes to spawn                                  |                  `1`                  | machine learning |
-| `MACHINE_LEARNING_WORKER_TIMEOUT`                | Maximum time (s) of unresponsiveness before a worker is killed       | `120` (`300` if using OpenVINO image) | machine learning |
-| `MACHINE_LEARNING_PRELOAD__CLIP`                 | Name of a CLIP model to be preloaded and kept in cache               |                                       | machine learning |
-| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION`   | Name of a facial recognition model to be preloaded and kept in cache |                                       | machine learning |
+| Variable                                         | Description                                                                                         |                Default                | Containers       |
+| :----------------------------------------------- | :-------------------------------------------------------------------------------------------------- | :-----------------------------------: | :--------------- |
+| `MACHINE_LEARNING_MODEL_TTL`                     | Inactivity time (s) before a model is unloaded (disabled if \<= 0)                                  |                 `300`                 | machine learning |
+| `MACHINE_LEARNING_MODEL_TTL_POLL_S`              | Interval (s) between checks for the model TTL (disabled if \<= 0)                                   |                 `10`                  | machine learning |
+| `MACHINE_LEARNING_CACHE_FOLDER`                  | Directory where models are downloaded                                                               |               `/cache`                | machine learning |
+| `MACHINE_LEARNING_REQUEST_THREADS`<sup>\*1</sup> | Thread count of the request thread pool (disabled if \<= 0)                                         |          number of CPU cores          | machine learning |
+| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS`        | Number of parallel model operations                                                                 |                  `1`                  | machine learning |
+| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS`        | Number of threads for each model operation                                                          |                  `2`                  | machine learning |
+| `MACHINE_LEARNING_WORKERS`<sup>\*2</sup>         | Number of worker processes to spawn                                                                 |                  `1`                  | machine learning |
+| `MACHINE_LEARNING_WORKER_TIMEOUT`                | Maximum time (s) of unresponsiveness before a worker is killed                                      | `120` (`300` if using OpenVINO image) | machine learning |
+| `MACHINE_LEARNING_PRELOAD__CLIP`                 | Name of a CLIP model to be preloaded and kept in cache                                              |                                       | machine learning |
+| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION`   | Name of a facial recognition model to be preloaded and kept in cache                                |                                       | machine learning |
+| `MACHINE_LEARNING_ANN`                           | Enable ARM-NN hardware acceleration if supported                                                    |                `True`                 | machine learning |
+| `MACHINE_LEARNING_ANN_FP16_TURBO`                | Execute operations in FP16 precision: increasing speed, reducing precision (applies only to ARM-NN) |                `False`                | machine learning |
+| `MACHINE_LEARNING_ANN_TUNING_LEVEL`              | ARM-NN GPU tuning level (1: rapid, 2: normal, 3: exhaustive)                                        |                  `2`                  | machine learning |

 \*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones.

--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@ -13,7 +13,7 @@ FROM builder-cpu as builder-armnn
 ENV ARMNN_PATH=/opt/armnn
 COPY ann /opt/ann
 RUN mkdir /opt/armnn && \
-    curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \
+    curl -SL "https://github.com/ARM-software/armnn/releases/download/v24.05/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \
    cd /opt/ann && \
    sh build.sh

@ -54,7 +54,7 @@ FROM prod-cpu as prod-armnn

 ENV LD_LIBRARY_PATH=/opt/armnn

-RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd && \
+RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd libgomp1 && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir --parents /etc/OpenCL/vendors && \
    echo "/usr/lib/libmali.so" > /etc/OpenCL/vendors/mali.icd && \
--- a/machine-learning/ann/ann.cpp
+++ b/machine-learning/ann/ann.cpp
@ -48,21 +48,22 @@ public:
             bool saveCachedNetwork,
             const char *cachedNetworkPath)
    {
-        INetworkPtr network = loadModel(modelPath);
-        IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath);
-        const IOInfos infos = getIOInfos(optNet.get());
-        NetworkId netId;
-        mutex.lock();
-        Status status = runtime->LoadNetwork(netId, std::move(optNet));
-        mutex.unlock();
-        if (status != Status::Success)
+        NetworkId netId = -2;
+        while (netId == -2)
        {
-            return -1;
+            try
+            {
+                netId = loadInternal(modelPath, fastMath, fp16, saveCachedNetwork, cachedNetworkPath);
+            }
+            catch (InvalidArgumentException e)
+            {
+                // fp16 models do not support the forced fp16-turbo (runtime fp32->fp16 conversion)
+                if (fp16)
+                    fp16 = false;
+                else
+                    netId = -1;
+            }
        }
-        spinLock.lock();
-        ioInfos[netId] = infos;
-        mutexes.emplace(netId, std::make_unique<std::mutex>());
-        spinLock.unlock();
        return netId;
    }

@ -117,6 +118,8 @@ public:
    Ann(int tuningLevel, const char *tuningFile)
    {
        IRuntime::CreationOptions runtimeOptions;
+        runtimeOptions.m_ProfilingOptions.m_EnableProfiling = false;
+        runtimeOptions.m_ProfilingOptions.m_TimelineEnabled = false;
        BackendOptions backendOptions{"GpuAcc",
                                      {
                                          {"TuningLevel", tuningLevel},
@ -133,6 +136,30 @@ public:
    };

 private:
+    // Performs a single load attempt: parses the model file, optimizes the
+    // network, registers it with the runtime and records its I/O infos plus
+    // a per-network mutex.
+    //
+    // Returns the network id assigned by the runtime, or -1 if
+    // runtime->LoadNetwork does not report Status::Success.
+    // Exceptions raised by the helpers or the runtime are not caught here;
+    // the caller decides whether to retry (e.g. with fp16 disabled).
+    int loadInternal(const char *modelPath,
+                     bool fastMath,
+                     bool fp16,
+                     bool saveCachedNetwork,
+                     const char *cachedNetworkPath)
+    {
+        NetworkId netId = -1;
+        INetworkPtr network = loadModel(modelPath);
+        IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath);
+        const IOInfos infos = getIOInfos(optNet.get());
+        Status status;
+        {
+            // Scoped guard instead of manual lock()/unlock(): if LoadNetwork
+            // throws, the global mutex is still released, so a retry by the
+            // caller cannot block on a mutex that was never unlocked.
+            std::lock_guard<std::mutex> runtimeGuard(mutex);
+            status = runtime->LoadNetwork(netId, std::move(optNet));
+        }
+        if (status != Status::Success)
+        {
+            return -1;
+        }
+        {
+            // Same reasoning for the spin lock: the map insertions and
+            // make_unique may throw, and the lock must not stay held.
+            std::lock_guard<SpinLock> mapsGuard(spinLock);
+            ioInfos[netId] = infos;
+            mutexes.emplace(netId, std::make_unique<std::mutex>());
+        }
+        return netId;
+    }
+
    INetworkPtr loadModel(const char *modelPath)
    {
        const auto path = std::string(modelPath);
@ -172,6 +199,8 @@ private:
        options.SetReduceFp32ToFp16(fp16);
        options.SetShapeInferenceMethod(shapeInferenceMethod);
        options.SetAllowExpandedDims(allowExpandedDims);
+        options.SetDebugToFileEnabled(false);
+        options.SetProfilingEnabled(false);

        BackendOptions gpuAcc("GpuAcc", {{"FastMathEnabled", fastMath}});
        if (cachedNetworkPath)
@ -232,8 +261,8 @@ private:
    IRuntime *runtime;
    std::map<NetworkId, IOInfos> ioInfos;
    std::map<NetworkId, std::unique_ptr<std::mutex>> mutexes; // mutex per network to not execute the same network concurrently
-    std::mutex mutex; // global mutex for load/unload calls to the runtime
-    SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps
+    std::mutex mutex;                                         // global mutex for load/unload calls to the runtime
+    SpinLock spinLock;                                        // fast spin lock to guard access to the ioInfos and mutexes maps
 };

 extern "C" void *init(int logLevel, int tuningLevel, const char *tuningFile)
--- a/machine-learning/ann/ann.py
+++ b/machine-learning/ann/ann.py
@ -120,6 +120,8 @@ class Ann(metaclass=_Singleton):
            save_cached_network,
            cached_network_path.encode() if cached_network_path is not None else None,
        )
+        if net_id < 0:
+            raise ValueError("Cannot load model!")

        self.input_shapes[net_id] = tuple(
            self.shape(net_id, input=True, index=i) for i in range(self.tensors(net_id, input=True))
--- a/machine-learning/app/config.py
+++ b/machine-learning/app/config.py
@ -30,6 +30,8 @@ class Settings(BaseSettings):
    model_inter_op_threads: int = 0
    model_intra_op_threads: int = 0
    ann: bool = True
+    ann_fp16_turbo: bool = False
+    ann_tuning_level: int = 2
    preload: PreloadModelData | None = None

    class Config:
--- a/machine-learning/app/sessions/ann.py
+++ b/machine-learning/app/sessions/ann.py
@ -20,12 +20,13 @@ class AnnSession:
    def __init__(self, model_path: Path, cache_dir: Path = settings.cache_folder) -> None:
        self.model_path = model_path
        self.cache_dir = cache_dir
-        self.ann = Ann(tuning_level=3, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
+        self.ann = Ann(tuning_level=settings.ann_tuning_level, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())

        log.info("Loading ANN model %s ...", model_path)
        self.model = self.ann.load(
            model_path.as_posix(),
            cached_network_path=model_path.with_suffix(".anncache").as_posix(),
+            fp16=settings.ann_fp16_turbo,
        )
        log.info("Loaded ANN model with ID %d", self.model)

--- a/machine-learning/app/test_main.py
+++ b/machine-learning/app/test_main.py
@ -268,9 +268,9 @@ class TestAnnSession:

        AnnSession(model_path, cache_dir)

-        ann_session.assert_called_once_with(tuning_level=3, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
+        ann_session.assert_called_once_with(tuning_level=2, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
        ann_session.return_value.load.assert_called_once_with(
-            model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix()
+            model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix(), fp16=False
        )
        info.assert_has_calls(
            [