Android: Voice typing: Add setting to allow specifying a glossary (#12370)

Co-authored-by: Laurent Cozic <laurent22@users.noreply.github.com>
2025-08-10 22:11:50 +02:00 · 2025-06-28 12:06:12 -07:00
parent 1644f56447
commit 6a5c85d3d7
5 changed files with 112 additions and 18 deletions
--- a/packages/app-mobile/services/voiceTyping/whisper.test.ts
+++ b/packages/app-mobile/services/voiceTyping/whisper.test.ts
@@ -2,28 +2,62 @@ import { setupDatabase } from '@joplin/lib/testing/test-utils';
 import whisper from './whisper';
 import { dirname, join } from 'path';
 import { exists, mkdir, remove, writeFile } from 'fs-extra';
+import Setting from '@joplin/lib/models/Setting';
+import { NativeModules } from 'react-native';
+const SpeechToTextModule = NativeModules.SpeechToTextModule;

 jest.mock('react-native', () => {
 	const reactNative = jest.requireActual('react-native');

+	let lastPrompt: string|null = null;
+
 	// Set properties on reactNative rather than creating a new object with
 	// {...reactNative, ...}. Creating a new object triggers deprecation warnings.
 	// See https://github.com/facebook/react-native/issues/28839.
 	reactNative.NativeModules.SpeechToTextModule = {
 		convertNext: () => 'Test. This is test output. Test!',
 		runTests: ()=> {},
-		openSession: jest.fn(() => {
+		openSession: jest.fn((_path, _locale, prompt) => {
+			lastPrompt = prompt;
+
 			const someId = 1234;
 			return someId;
 		}),
 		closeSession: jest.fn(),
 		startRecording: jest.fn(),
 		convertAvailable: jest.fn(() => ''),
+		testing__lastPrompt: () => {
+			return lastPrompt;
+		},
 	};

 	return reactNative;
 });

+interface ModelConfig {
+	output: {
+		stringReplacements: string[][];
+		regexReplacements: string[][];
+	};
+}
+
+const defaultModelConfig: ModelConfig = {
+	output: { stringReplacements: [], regexReplacements: [] },
+};
+
+const createMockModel = async (config: ModelConfig = defaultModelConfig) => {
+	const whisperBaseDirectory = dirname(whisper.modelLocalFilepath('en'));
+	await mkdir(whisperBaseDirectory);
+
+	const modelDirectory = join(whisperBaseDirectory, 'model');
+	await mkdir(modelDirectory);
+
+	await writeFile(join(modelDirectory, 'model.bin'), 'mock model', 'utf-8');
+	await writeFile(join(modelDirectory, 'config.json'), JSON.stringify(config), 'utf-8');
+
+	return modelDirectory;
+};
+
 describe('whisper', () => {
 	beforeEach(async () => {
 		await setupDatabase(0);
@@ -45,14 +79,7 @@ describe('whisper', () => {
 	});

 	test('should apply post-processing replacements specified in the model config', async () => {
-		const whisperBaseDirectory = dirname(whisper.modelLocalFilepath('en'));
-		await mkdir(whisperBaseDirectory);
-
-		const modelDirectory = join(whisperBaseDirectory, 'model');
-		await mkdir(modelDirectory);
-
-		await writeFile(join(modelDirectory, 'model.bin'), 'mock model', 'utf-8');
-		await writeFile(join(modelDirectory, 'config.json'), JSON.stringify({
+		const modelDirectory = await createMockModel({
 			output: {
 				stringReplacements: [
 					['Test', 'replaced'],
@@ -61,7 +88,7 @@ describe('whisper', () => {
 					['replace[d]', 'replaced again!'],
 				],
 			},
-		}), 'utf-8');
+		});

 		let lastFinalizedText = '';
 		const onFinalize = jest.fn((text: string) => {
@@ -85,4 +112,29 @@ describe('whisper', () => {
 			lastFinalizedText,
 		).toBe('\n\nreplaced again!. This is test output. replaced again!!');
 	});
+
+	it.each([
+		{ glossary: '', expectedPrompt: '' },
+		{ glossary: 'test', expectedPrompt: 'Glossary: test' },
+		{ glossary: 'Joplin, app', expectedPrompt: 'Glossary: Joplin, app' },
+		// Should not include the "Glossary:" prefix if there's no translation for it
+		{ glossary: 'Joplin, app', expectedPrompt: 'Joplin, app', locale: 'testLocale-test' },
+	])('should construct a prompt from the user-specified glossary (%j)', async ({ glossary, expectedPrompt, locale }) => {
+		Setting.setValue('voiceTyping.glossary', glossary);
+
+		const modelDirectory = await createMockModel();
+		const session = await whisper.build({
+			modelPath: modelDirectory,
+			callbacks: {
+				onFinalize: () => {
+					return session.stop();
+				},
+				onPreview: jest.fn(),
+			},
+			locale: locale ?? 'en',
+		});
+		await session.start();
+
+		expect(SpeechToTextModule.testing__lastPrompt()).toBe(expectedPrompt);
+	});
 });
--- a/packages/app-mobile/services/voiceTyping/whisper.ts
+++ b/packages/app-mobile/services/voiceTyping/whisper.ts
@@ -5,7 +5,7 @@ import { rtrimSlashes } from '@joplin/utils/path';
 import { dirname, join } from 'path';
 import { NativeModules } from 'react-native';
 import { SpeechToTextCallbacks, VoiceTypingProvider, VoiceTypingSession } from './VoiceTyping';
-import { languageCodeOnly } from '@joplin/lib/locale';
+import { languageCodeOnly, stringByLocale } from '@joplin/lib/locale';

 const logger = Logger.create('voiceTyping/whisper');

@@ -178,8 +178,30 @@ class Whisper implements VoiceTypingSession {
 	}
 }

+const getGlossaryPrompt = (locale: string) => {
+	const glossary = Setting.value('voiceTyping.glossary');
+	if (!glossary) return '';
+
+	// Re-define the "_" localization function so that it uses the transcription locale (as opposed to the UI locale).
+	const _ = (text: string) => {
+		return stringByLocale(locale, text);
+	};
+	let glossaryPrefix = _('Glossary:');
+
+	// Prefer no prefix if no appropriate translation of "Glossary:" is available:
+	if (glossaryPrefix === 'Glossary:' && languageCodeOnly(locale) !== 'en') {
+		glossaryPrefix = '';
+	}
+
+	return `${glossaryPrefix} ${glossary}`.trim();
+};
+
 const getPrompt = (locale: string, localeToPrompt: Map<string, string>) => {
-	return localeToPrompt.get(languageCodeOnly(locale)) ?? '';
+	const basePrompt = localeToPrompt.get(languageCodeOnly(locale));
+	return [
+		basePrompt,
+		getGlossaryPrompt(locale),
+	].filter(part => !!part).join(' ');
 };

 const modelLocalDirectory = () => {
--- a/packages/lib/locale.ts
+++ b/packages/lib/locale.ts
@@ -733,4 +733,4 @@ const stringByLocale = (locale: string, s: string, ...args: any[]): string => {
 	}
 };

-export { _, _n, supportedLocales, languageName, currentLocale, localesFromLanguageCode, languageCodeOnly, countryDisplayName, localeStrings, setLocale, supportedLocalesToLanguages, defaultLocale, closestSupportedLocale, languageCode, countryCodeOnly };
+export { _, _n, supportedLocales, languageName, currentLocale, localesFromLanguageCode, languageCodeOnly, countryDisplayName, localeStrings, setLocale, supportedLocalesToLanguages, defaultLocale, closestSupportedLocale, stringByLocale, languageCode, countryCodeOnly };
--- a/packages/lib/models/settings/builtInMetadata.ts
+++ b/packages/lib/models/settings/builtInMetadata.ts
@@ -14,6 +14,11 @@ const customCssFilePath = (Setting: typeof SettingType, filename: string): strin
 	return `${Setting.value('rootProfileDir')}/${filename}`;
 };

+const showVoiceTypingSettings = () => (
+	// For now, iOS and web don't support voice typing.
+	shim.mobilePlatform() === 'android'
+);
+
 export enum CameraDirection {
 	Back,
 	Front,
@@ -1803,8 +1808,7 @@ const builtInMetadata = (Setting: typeof SettingType) => {
 			appTypes: [AppType.Mobile],
 			description: () => _('Leave it blank to download the language files from the default website'),
 			label: () => _('Voice typing language files (URL)'),
-			// For now, iOS and web don't support voice typing.
-			show: () => shim.mobilePlatform() === 'android',
+			show: showVoiceTypingSettings,
 			section: 'note',
 		},

@@ -1815,8 +1819,7 @@ const builtInMetadata = (Setting: typeof SettingType) => {
 			appTypes: [AppType.Mobile],
 			label: () => _('Preferred voice typing provider'),
 			isEnum: true,
-			// For now, iOS and web don't support voice typing.
-			show: () => shim.mobilePlatform() === 'android',
+			show: showVoiceTypingSettings,
 			section: 'note',

 			options: () => {
@@ -1827,6 +1830,17 @@ const builtInMetadata = (Setting: typeof SettingType) => {
 			},
 		},

+		'voiceTyping.glossary': {
+			value: '',
+			type: SettingItemType.String,
+			public: true,
+			appTypes: [AppType.Mobile],
+			label: () => _('Voice typing: Glossary'),
+			description: () => _('A comma-separated list of words. May be used for uncommon words, to help voice typing spell them correctly.'),
+			show: (settings) => showVoiceTypingSettings() && settings['voiceTyping.preferredProvider'].startsWith('whisper'),
+			section: 'note',
+		},
+
 		'trash.autoDeletionEnabled': {
 			value: true,
 			type: SettingItemType.Bool,
--- a/readme/dev/spec/voice_typing.md
+++ b/readme/dev/spec/voice_typing.md
@@ -10,6 +10,12 @@ By default, Joplin uses Whisper.cpp for voice typing.

 Whisper.cpp provides a number of pre-trained models for transcribing speech in different languages. Both [English-only and multilingual models](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages) are available. The multilingual models support a variety of different languages. Joplin uses the smallest of the multilingual models by default.

+### Preventing spelling mistakes
+
+Joplin allows specifying a glossary for voice typing using the "Voice typing: Glossary" setting (in the "Note" section of settings). The setting is only shown on Android, and only when a Whisper-based voice typing provider is selected. Including uncommon words in the glossary makes voice typing more likely to spell them correctly. For example, providing `Scott Joplin, ragtime.` as the glossary helps voice typing correctly spell "Scott Joplin" and "ragtime".
+
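+Internally, this is implemented using [prompting](https://cookbook.openai.com/examples/whisper_prompting_guide#pass-names-in-the-prompt-to-prevent-misspellings). The glossary is appended to the model's base prompt for the transcription language (if there is one), preceded by a translation of "Glossary:" into that language. If the transcription language is not English and no translation of "Glossary:" is available, the glossary is added without a prefix.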
+
 ### Downloading the models

 By default, Joplin downloads Whisper models from [this GitHub repository](https://github.com/joplin/voice-typing-models). It's possible to download models from a custom location by changing the **Voice typing language files (URL)** in from the "Note" tab of the configuration screen.
@@ -72,4 +78,4 @@ You can also configure the application to download the models from your own serv

 * **Provide the base URL**, eg `https://example.com/models`. Then Joplin will automatically append the filename to that URL, for example it will download the French files from `https://example.com/models/fr.zip`

-* **Provide a URL template**. In that case, include a `{lang}` variable, which will be expanded to the language code. For example, if the URL is set to `https://example.com/models/vosk-model-{lang}.zip`, the app will download the French file from `https://example.com/models/vosk-model-fr.zip`. With this option you have more flexibility on where the app should get the file from. For example you can also use query parameters, as in `https://example.com/models/vosk-models.php?lang={lang}&download=true`
+* **Provide a URL template**. In that case, include a `{lang}` variable, which will be expanded to the language code. For example, if the URL is set to `https://example.com/models/vosk-model-{lang}.zip`, the app will download the French file from `https://example.com/models/vosk-model-fr.zip`. With this option you have more flexibility on where the app should get the file from. For example you can also use query parameters, as in `https://example.com/models/vosk-models.php?lang={lang}&download=true`