1
0
mirror of https://github.com/laurent22/joplin.git synced 2026-03-12 10:00:05 +02:00

Compare commits

...

3 Commits

Author SHA1 Message Date
Laurent Cozic
a747996d8c update 2026-02-18 19:11:03 +00:00
Laurent Cozic
0877d6e9cd update 2026-02-18 17:56:09 +00:00
Laurent Cozic
66aa47a5ca update 2026-02-18 17:07:42 +00:00
12 changed files with 268 additions and 16 deletions

View File

@@ -214,6 +214,7 @@ module.exports = {
'packages/tools/**',
'packages/app-mobile/tools/**',
'packages/app-desktop/tools/**',
'packages/transcribe/src/tools/**',
],
'rules': {
'no-console': 'off',

View File

@@ -250,4 +250,8 @@ mrjo
codegen
analyzed
Perfetto
appmodules
appmodules
cuda
CUDA
mtmd
gguf

View File

@@ -5,4 +5,5 @@ images/*
models/
*.sqlite3
*.sqlite-journal
.env
.env
htr-metal/

View File

@@ -0,0 +1,24 @@
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
RUN apt-get update && apt-get install -y \
wget \
unzip \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64-cuda-12.zip
RUN mkdir /models/
RUN wget -q -O /models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf
RUN wget -q -O /models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf
WORKDIR /app
RUN unzip llama-b5449-bin-ubuntu-x64-cuda-12.zip
WORKDIR /app/build/bin
# Create an entrypoint script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

View File

@@ -15,6 +15,84 @@ docker run --env-file .env-transcribe -p 4567:4567 \
transcribe
```
## GPU Acceleration
By default the server runs inference on CPU. Set `HTR_CLI_GPU_TYPE` in your `.env` to enable GPU acceleration.
| Value | Hardware | Requires |
|-------|----------|---------|
| `none` | CPU (default) | Nothing extra |
| `cuda` | NVIDIA GPU | NVIDIA Docker runtime (`nvidia-container-toolkit`) |
| `metal` | Apple Silicon | Native binary (no Docker for inference) |
### NVIDIA CUDA
1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) on the host.
2. Build the GPU Docker image:
```shell
docker build -f packages/transcribe/Dockerfile.htr-cli-gpu -t joplin/htr-cli-gpu:latest .
```
3. Add these variables to your `.env`:
```env
HTR_CLI_GPU_TYPE=cuda
HTR_CLI_DOCKER_IMAGE=joplin/htr-cli-gpu:latest
```
4. Start the transcribe container with `--gpus all`:
```shell
docker run --env-file .env-transcribe -p 4567:4567 \
--gpus all \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ./packages/transcribe/images:/app/packages/transcribe/images \
transcribe
```
### Apple Silicon (Metal)
Metal GPU access is not available inside Docker containers on macOS, so the inference binary runs natively on the host instead of in a container.
1. Run the setup script to download the native binary and model files (from `packages/transcribe`):
```shell
yarn setupMetal
```
An optional `--install-dir` argument controls where files are downloaded (default: `./htr-metal`):
```shell
yarn setupMetal --install-dir /opt/htr-metal
```
The script prints the exact `.env` lines to add, for example:
```env
HTR_CLI_GPU_TYPE=metal
HTR_CLI_BINARY_PATH=/path/to/htr-metal/bin/llama-mtmd-cli
HTR_CLI_MODELS_FOLDER=/path/to/htr-metal/models
```
2. Add those lines to your `.env`.
3. Start the server normally — `HTR_CLI_IMAGES_FOLDER` still needs to be set and accessible to the native binary.
Here's a sample `.env` file for testing:
```ini
# Copy these lines from the output of `yarn setupMetal`
HTR_CLI_GPU_TYPE=metal
HTR_CLI_BINARY_PATH=
HTR_CLI_MODELS_FOLDER=
HTR_CLI_IMAGES_FOLDER=/path/to/images
API_KEY=test-key
QUEUE_DRIVER=sqlite
QUEUE_DATABASE_NAME=./queue.sqlite3
```
## Using Docker Compose
The minimal configuration is provided in `.env-sample` and `docker-compose.server.yml`.

View File

@@ -6,6 +6,7 @@
"rebuild": "yarn clean && yarn build && yarn tsc",
"build": "gulp build",
"start": "node dist/src/api/app.js",
"setupMetal": "node dist/src/tools/setupMetal.js",
"tsc": "tsc --project tsconfig.json",
"test": "jest --verbose=false",
"test-all": "TRANSCRIBE_RUN_ALL=1 jest --verbose=false",

View File

@@ -34,7 +34,13 @@ const init = async (logger: LoggerWrapper) => {
app.context.queue = queue;
app.context.storage = fileStorage;
const htrCli = new HtrCli(envVariables.HTR_CLI_DOCKER_IMAGE, envVariables.HTR_CLI_IMAGES_FOLDER);
const htrCli = new HtrCli({
htrCliDockerImage: envVariables.HTR_CLI_DOCKER_IMAGE,
htrCliImagesFolder: envVariables.HTR_CLI_IMAGES_FOLDER,
gpuType: envVariables.HTR_CLI_GPU_TYPE as import('../core/HtrCli').GpuType,
binaryPath: envVariables.HTR_CLI_BINARY_PATH,
modelsFolder: envVariables.HTR_CLI_MODELS_FOLDER,
});
const jobProcessor = new JobProcessor(queue, htrCli, fileStorage);
@@ -46,6 +52,10 @@ const init = async (logger: LoggerWrapper) => {
const checkServerConfigurations = (envVariables: EnvVariables) => {
if (!envVariables.API_KEY) throw Error('API_KEY environment variable not set.');
if (!envVariables.HTR_CLI_IMAGES_FOLDER) throw Error('HTR_CLI_IMAGES_FOLDER environment variable not set. This should point to a folder where images will be stored.');
if (envVariables.HTR_CLI_GPU_TYPE === 'metal') {
if (!envVariables.HTR_CLI_BINARY_PATH) throw Error('HTR_CLI_BINARY_PATH environment variable not set. This should point to the native llama-mtmd-cli binary for Metal GPU mode.');
if (!envVariables.HTR_CLI_MODELS_FOLDER) throw Error('HTR_CLI_MODELS_FOLDER environment variable not set. This should point to the folder containing the model files for Metal GPU mode.');
}
};
const main = async () => {

View File

@@ -2,7 +2,7 @@ import { readFile } from 'fs-extra';
import HtrCli from './HtrCli';
describe('HtrCli', () => {
const dt = new HtrCli('', '');
const dt = new HtrCli({ htrCliDockerImage: '', htrCliImagesFolder: '', gpuType: 'none' });
it('should parse multiline result', async () => {
const testCase = await readFile('./test-cases/1.txt');
const result = dt.cleanUpResult(testCase.toString());

View File

@@ -4,26 +4,48 @@ import { WorkHandler } from '../types';
const logger = Logger.create('HtrCli');
export type GpuType = 'none' | 'cuda' | 'metal';
const systemPrompt = 'SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: ```{{example}}```. If you find no text, output ``````.. Your turn:';
export interface HtrCliOptions {
htrCliDockerImage: string;
htrCliImagesFolder: string;
gpuType: GpuType;
// Required when gpuType is 'metal'
binaryPath?: string;
modelsFolder?: string;
}
export default class HtrCli implements WorkHandler {
private htrCliDockerImage: string;
private htrCliImagesFolder: string;
private options: HtrCliOptions;
public constructor(htrCliDockerImage: string, htrCliImagesFolder: string) {
this.htrCliDockerImage = htrCliDockerImage;
this.htrCliImagesFolder = htrCliImagesFolder;
public constructor(options: HtrCliOptions) {
this.options = options;
}
public async init() {
if (this.options.gpuType === 'metal') {
logger.info('Metal GPU mode: skipping Docker image pull (native binary)');
return;
}
logger.info('Loading');
const result = await execCommand(['docker', 'pull', this.htrCliDockerImage], { quiet: true });
const result = await execCommand(['docker', 'pull', this.options.htrCliDockerImage], { quiet: true });
logger.info('Finished loading: ', result);
}
public async run(imageName: string) {
const command = ['docker', 'run', '--rm', '-t', '-v', `${this.htrCliImagesFolder}:/images`, this.htrCliDockerImage, imageName];
logger.info('Running transcription...');
let command: string[];
if (this.options.gpuType === 'metal') {
command = this.buildMetalCommand(imageName);
} else {
command = this.buildDockerCommand(imageName);
}
logger.info(`Command: ${commandToString(command[0], command.slice(1))}`);
const result = await execCommand(command, { quiet: true });
@@ -31,6 +53,27 @@ export default class HtrCli implements WorkHandler {
return this.cleanUpResult(result);
}
// Assembles the `docker run` argv for CPU or CUDA mode; CUDA only differs by
// passing `--gpus all` through to the container runtime.
private buildDockerCommand(imageName: string): string[] {
	const { gpuType, htrCliImagesFolder, htrCliDockerImage } = this.options;
	const command = ['docker', 'run', '--rm', '-t'];
	if (gpuType === 'cuda') command.push('--gpus', 'all');
	command.push('-v', `${htrCliImagesFolder}:/images`, htrCliDockerImage, imageName);
	return command;
}
// Assembles the argv for running the native llama-mtmd-cli binary directly on
// the host (Apple Silicon / Metal mode, where the GPU is not reachable from
// inside Docker). binaryPath/modelsFolder default to '' so a misconfiguration
// fails at spawn time rather than here.
private buildMetalCommand(imageName: string): string[] {
	const { binaryPath = '', modelsFolder = '', htrCliImagesFolder } = this.options;
	return [
		binaryPath,
		// Model weights and multimodal projector as laid out by setupMetal.
		'-m', `${modelsFolder}/Model-7.6B-Q4_K_M.gguf`,
		'--mmproj', `${modelsFolder}/mmproj-model-f16.gguf`,
		// Context size plus sampling parameters; the low temperature biases the
		// model toward literal transcription. NOTE(review): these constants are
		// presumably mirrored from the htr-cli Docker image's invocation —
		// confirm against that image's entrypoint.
		'-c', '4096',
		'--temp', '0.05',
		'--top-p', '0.8',
		'--top-k', '100',
		'--repeat-penalty', '1.05',
		'--image', `${htrCliImagesFolder}/${imageName}`,
		'-p', systemPrompt,
	];
}
public cleanUpResult(transcriptionAndLogs: string) {
const s1 = transcriptionAndLogs.split(/image decoded.*/);
// Before the last `image decoded` line it is all logs generated by the transcription tool

View File

@@ -8,6 +8,9 @@ export const defaultEnvValues: EnvVariables = {
QUEUE_MAINTENANCE_INTERVAL: 60 * Second,
HTR_CLI_DOCKER_IMAGE: 'joplin/htr-cli:latest',
HTR_CLI_IMAGES_FOLDER: '',
HTR_CLI_GPU_TYPE: 'none', // 'none' | 'cuda' | 'metal'
HTR_CLI_BINARY_PATH: '', // Path to native llama-mtmd-cli binary (required for metal)
HTR_CLI_MODELS_FOLDER: '', // Path to models directory (required for metal)
QUEUE_DRIVER: 'pg', // 'sqlite'
QUEUE_DATABASE_PASSWORD: '',
QUEUE_DATABASE_NAME: '',
@@ -27,6 +30,9 @@ export interface EnvVariables {
QUEUE_MAINTENANCE_INTERVAL: number;
HTR_CLI_DOCKER_IMAGE: string;
HTR_CLI_IMAGES_FOLDER: string;
HTR_CLI_GPU_TYPE: string;
HTR_CLI_BINARY_PATH: string;
HTR_CLI_MODELS_FOLDER: string;
QUEUE_DRIVER: string;
QUEUE_DATABASE_PASSWORD: string;
QUEUE_DATABASE_NAME: string;

View File

@@ -0,0 +1,84 @@
// Downloads the native llama.cpp binary and model files required for Apple Silicon (Metal) GPU mode.
// Run once to set up the Metal environment, then configure .env accordingly.
//
// Usage:
// yarn setupMetal [--install-dir ./htr-metal]
import { fetchWithRetry } from '@joplin/utils/net';
import { execCommand } from '@joplin/utils';
import * as fs from 'fs-extra';
import { join, resolve } from 'path';
import { createWriteStream } from 'fs';
import { pipeline } from 'stream/promises';
// llama.cpp release tag — pinned to the same b5449 build the Docker images use.
const LLAMA_RELEASE = 'b5449';
// Prebuilt macOS ARM64 (Apple Silicon) archive for that release.
const LLAMA_ZIP = `llama-${LLAMA_RELEASE}-bin-macos-arm64.zip`;
const LLAMA_URL = `https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_RELEASE}/${LLAMA_ZIP}`;
// MiniCPM-o 2.6 GGUF model files hosted on Hugging Face.
const MODEL_BASE_URL = 'https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main';
// Streams the response body for `url` into `destPath`, retrying transient
// network failures via fetchWithRetry. Throws on a missing or non-2xx response.
const downloadFile = async (url: string, destPath: string) => {
	console.info(`Downloading ${url} ...`);
	const response = await fetchWithRetry(url, { retry: 3, pause: 2000 });
	if (!(response && response.ok)) {
		throw new Error(`Failed to download ${url}: ${response?.status} ${response?.statusText}`);
	}
	const out = createWriteStream(destPath);
	await pipeline(response.body, out);
};
// Depth-first search of `dir` for a file named exactly `name`.
// Returns the full path of the first match in directory order, or '' if absent.
// Errors while descending into a subdirectory are treated as "not found there".
const findBinary = async (dir: string, name: string): Promise<string> => {
	const entries = await fs.readdir(dir, { withFileTypes: true });
	for (const entry of entries) {
		const fullPath = join(dir, entry.name);
		if (!entry.isDirectory()) {
			if (entry.name === name) return fullPath;
			continue;
		}
		const nested = await findBinary(fullPath, name).catch(() => '');
		if (nested) return nested;
	}
	return '';
};
// Entry point: downloads the llama.cpp macOS ARM64 build plus the MiniCPM-o
// model files into --install-dir (default ./htr-metal), then prints the .env
// variables needed for Metal GPU mode.
const main = async () => {
	const args = process.argv.slice(2);
	const installDirArg = args.indexOf('--install-dir');
	// Guard: `--install-dir` given as the last argument would make the value
	// undefined and `resolve(undefined)` throw an opaque TypeError.
	if (installDirArg >= 0 && !args[installDirArg + 1]) {
		throw new Error('--install-dir requires a path argument, e.g. --install-dir ./htr-metal');
	}
	const installDir = resolve(installDirArg >= 0 ? args[installDirArg + 1] : './htr-metal');
	const binDir = join(installDir, 'bin');
	const modelsDir = join(installDir, 'models');

	await fs.mkdirp(binDir);
	await fs.mkdirp(modelsDir);

	// Download and extract the llama.cpp macOS ARM binary; the zip is removed
	// once extracted.
	const zipPath = join(installDir, LLAMA_ZIP);
	console.info(`\nDownloading llama.cpp macOS ARM binary (${LLAMA_RELEASE})...`);
	await downloadFile(LLAMA_URL, zipPath);
	console.info('Extracting...');
	await execCommand(['unzip', '-o', zipPath, '-d', binDir]);
	await fs.remove(zipPath);

	// Download the model weights and the multimodal projector.
	console.info('\nDownloading model files...');
	await downloadFile(
		`${MODEL_BASE_URL}/Model-7.6B-Q4_K_M.gguf`,
		join(modelsDir, 'Model-7.6B-Q4_K_M.gguf'),
	);
	await downloadFile(
		`${MODEL_BASE_URL}/mmproj-model-f16.gguf`,
		join(modelsDir, 'mmproj-model-f16.gguf'),
	);

	// Locate the extracted binary (the archive may nest it in subdirectories)
	// and make sure it is executable.
	const binaryPath = await findBinary(binDir, 'llama-mtmd-cli');
	if (!binaryPath) throw new Error('llama-mtmd-cli binary not found after extraction.');
	await fs.chmod(binaryPath, 0o755);

	console.info('\nSetup complete. Add these variables to your .env file:\n');
	console.info('HTR_CLI_GPU_TYPE=metal');
	console.info(`HTR_CLI_BINARY_PATH=${resolve(binaryPath)}`);
	console.info(`HTR_CLI_MODELS_FOLDER=${resolve(modelsDir)}`);
};

main().catch(error => {
	console.error('Fatal error:', error);
	process.exit(1);
});

View File

@@ -36,7 +36,7 @@ describe('JobProcessor', () => {
skipByDefault('should execute work on job in the queue', async () => {
jest.useRealTimers();
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliDockerImage: 'joplin/htr-cli:latest', htrCliImagesFolder: join(process.cwd(), 'images'), gpuType: 'none' }), new FileStorage(), 1000);
await tw.init();
await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy.png'));
@@ -59,7 +59,7 @@ describe('JobProcessor', () => {
skipByDefault('should execute work on job in the queue even if one fails', async () => {
jest.useRealTimers();
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliDockerImage: 'joplin/htr-cli:latest', htrCliImagesFolder: join(process.cwd(), 'images'), gpuType: 'none' }), new FileStorage(), 1000);
await tw.init();
await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy_2.png'));
@@ -84,7 +84,7 @@ describe('JobProcessor', () => {
skipByDefault('should remove file sent to queue if job is completed', async () => {
jest.useRealTimers();
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliDockerImage: 'joplin/htr-cli:latest', htrCliImagesFolder: join(process.cwd(), 'images'), gpuType: 'none' }), new FileStorage(), 1000);
await tw.init();
const imagePath = join('images', 'htr_sample_copy_3.png');
await copy(join('images', 'htr_sample.png'), imagePath);
@@ -112,7 +112,7 @@ describe('JobProcessor', () => {
const fileStorage = new FileStorage();
const mockedFileStorageRemove = jest.fn();
fileStorage.remove = mockedFileStorageRemove;
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), fileStorage, 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliDockerImage: 'joplin/htr-cli:latest', htrCliImagesFolder: join(process.cwd(), 'images'), gpuType: 'none' }), fileStorage, 1000);
await tw.init();
// file doesn't exist to force a fail, but the call to remove the file should still exist