1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-08-10 22:11:50 +02:00

Android: Voice typing: Add setting to allow specifying a glossary (#12370)

Co-authored-by: Laurent Cozic <laurent22@users.noreply.github.com>
This commit is contained in:
Henry Heino
2025-06-28 12:06:12 -07:00
committed by GitHub
parent 1644f56447
commit 6a5c85d3d7
5 changed files with 112 additions and 18 deletions

View File

@@ -2,28 +2,62 @@ import { setupDatabase } from '@joplin/lib/testing/test-utils';
import whisper from './whisper';
import { dirname, join } from 'path';
import { exists, mkdir, remove, writeFile } from 'fs-extra';
import Setting from '@joplin/lib/models/Setting';
import { NativeModules } from 'react-native';
const SpeechToTextModule = NativeModules.SpeechToTextModule;
// Replaces the native SpeechToTextModule with an in-memory fake so that the
// Whisper provider can be tested without any native code. The fake records the
// prompt passed to openSession so that tests can inspect it afterwards.
jest.mock('react-native', () => {
	const reactNative = jest.requireActual('react-native');
	// Prompt most recently passed to openSession (null until a session is opened).
	let lastPrompt: string|null = null;

	// Set properties on reactNative rather than creating a new object with
	// {...reactNative, ...}. Creating a new object triggers deprecation warnings.
	// See https://github.com/facebook/react-native/issues/28839.
	reactNative.NativeModules.SpeechToTextModule = {
		convertNext: () => 'Test. This is test output. Test!',
		runTests: () => {},
		// Only the third argument (the prompt) is recorded; the first two are ignored.
		openSession: jest.fn((_path, _locale, prompt) => {
			lastPrompt = prompt;
			const someId = 1234;
			return someId;
		}),
		closeSession: jest.fn(),
		startRecording: jest.fn(),
		convertAvailable: jest.fn(() => ''),
		// Test-only accessor: returns the prompt from the latest openSession call.
		testing__lastPrompt: () => {
			return lastPrompt;
		},
	};

	return reactNative;
});
// Shape of the config.json written next to the mock model in these tests.
interface ModelConfig {
	output: {
		// Plain-text replacements; the tests below supply [from, to] pairs.
		stringReplacements: Array<string[]>;
		// Replacements whose first item is a regular expression source.
		regexReplacements: Array<string[]>;
	};
}

// Config used when a test does not need any output replacements.
const defaultModelConfig: ModelConfig = {
	output: {
		stringReplacements: [],
		regexReplacements: [],
	},
};
// Writes a fake Whisper model (a placeholder model.bin plus a config.json built
// from `config`) into a "model" directory next to the 'en' model file path.
// Resolves to the path of the created model directory.
const createMockModel = async (config: ModelConfig = defaultModelConfig) => {
	const baseDir = dirname(whisper.modelLocalFilepath('en'));
	const modelDir = join(baseDir, 'model');

	// Create the parent directory first, then the nested model directory.
	await mkdir(baseDir);
	await mkdir(modelDir);

	const files: [string, string][] = [
		['model.bin', 'mock model'],
		['config.json', JSON.stringify(config)],
	];
	for (const [fileName, content] of files) {
		await writeFile(join(modelDir, fileName), content, 'utf-8');
	}

	return modelDir;
};
describe('whisper', () => {
beforeEach(async () => {
await setupDatabase(0);
@@ -45,14 +79,7 @@ describe('whisper', () => {
});
test('should apply post-processing replacements specified in the model config', async () => {
const whisperBaseDirectory = dirname(whisper.modelLocalFilepath('en'));
await mkdir(whisperBaseDirectory);
const modelDirectory = join(whisperBaseDirectory, 'model');
await mkdir(modelDirectory);
await writeFile(join(modelDirectory, 'model.bin'), 'mock model', 'utf-8');
await writeFile(join(modelDirectory, 'config.json'), JSON.stringify({
const modelDirectory = await createMockModel({
output: {
stringReplacements: [
['Test', 'replaced'],
@@ -61,7 +88,7 @@ describe('whisper', () => {
['replace[d]', 'replaced again!'],
],
},
}), 'utf-8');
});
let lastFinalizedText = '';
const onFinalize = jest.fn((text: string) => {
@@ -85,4 +112,29 @@ describe('whisper', () => {
lastFinalizedText,
).toBe('\n\nreplaced again!. This is test output. replaced again!!');
});
// For each case: store the glossary setting, start a session against a mock
// model, then check the prompt that reached the mocked native module.
it.each([
	{ glossary: '', expectedPrompt: '' },
	{ glossary: 'test', expectedPrompt: 'Glossary: test' },
	{ glossary: 'Joplin, app', expectedPrompt: 'Glossary: Joplin, app' },
	// Should not include the "Glossary:" prefix if there's no translation for it
	{ glossary: 'Joplin, app', expectedPrompt: 'Joplin, app', locale: 'testLocale-test' },
])('should construct a prompt from the user-specified glossary (%j)', async (testCase) => {
	const { glossary, expectedPrompt } = testCase;
	// Cases without an explicit locale are transcribed as English.
	const transcriptionLocale = testCase.locale ?? 'en';

	Setting.setValue('voiceTyping.glossary', glossary);
	const modelPath = await createMockModel();

	const session = await whisper.build({
		modelPath,
		callbacks: {
			onFinalize: () => session.stop(),
			onPreview: jest.fn(),
		},
		locale: transcriptionLocale,
	});
	await session.start();

	const actualPrompt = SpeechToTextModule.testing__lastPrompt();
	expect(actualPrompt).toBe(expectedPrompt);
});
});

View File

@@ -5,7 +5,7 @@ import { rtrimSlashes } from '@joplin/utils/path';
import { dirname, join } from 'path';
import { NativeModules } from 'react-native';
import { SpeechToTextCallbacks, VoiceTypingProvider, VoiceTypingSession } from './VoiceTyping';
import { languageCodeOnly } from '@joplin/lib/locale';
import { languageCodeOnly, stringByLocale } from '@joplin/lib/locale';
const logger = Logger.create('voiceTyping/whisper');
@@ -178,8 +178,30 @@ class Whisper implements VoiceTypingSession {
}
}
// Builds the glossary part of the transcription prompt from the
// "voiceTyping.glossary" setting.
//
// Returns an empty string when no glossary is set (including a glossary that
// contains only whitespace). Otherwise returns the glossary, prefixed with a
// localized "Glossary:" label when one is available for `locale`.
const getGlossaryPrompt = (locale: string): string => {
	const rawGlossary = Setting.value('voiceTyping.glossary');
	// Trim before the emptiness check so that a whitespace-only value doesn't
	// produce a prompt that consists of just the "Glossary:" label.
	const glossary = typeof rawGlossary === 'string' ? rawGlossary.trim() : '';
	if (!glossary) return '';

	// Re-define the "_" localization function so that it uses the transcription locale (as opposed to the UI locale).
	const _ = (text: string) => {
		return stringByLocale(locale, text);
	};

	let glossaryPrefix = _('Glossary:');
	// Prefer no prefix if no appropriate translation of "Glossary:" is available:
	// an untranslated result for a non-English locale means the lookup fell back
	// to the source string.
	if (glossaryPrefix === 'Glossary:' && languageCodeOnly(locale) !== 'en') {
		glossaryPrefix = '';
	}

	// trim() removes the leading space left behind when the prefix is empty.
	return `${glossaryPrefix} ${glossary}`.trim();
};
// Returns the full prompt for a transcription session in `locale`: the
// per-language base prompt from `localeToPrompt` (looked up by language code
// only), followed by the glossary prompt. Missing or empty parts are skipped,
// so the result is an empty string when neither part is available.
const getPrompt = (locale: string, localeToPrompt: Map<string, string>) => {
	// May be undefined when there is no base prompt for this language.
	const basePrompt = localeToPrompt.get(languageCodeOnly(locale));

	return [
		basePrompt,
		getGlossaryPrompt(locale),
	].filter(part => !!part).join(' ');
};
const modelLocalDirectory = () => {

View File

@@ -733,4 +733,4 @@ const stringByLocale = (locale: string, s: string, ...args: any[]): string => {
}
};
export { _, _n, supportedLocales, languageName, currentLocale, localesFromLanguageCode, languageCodeOnly, countryDisplayName, localeStrings, setLocale, supportedLocalesToLanguages, defaultLocale, closestSupportedLocale, languageCode, countryCodeOnly };
export { _, _n, supportedLocales, languageName, currentLocale, localesFromLanguageCode, languageCodeOnly, countryDisplayName, localeStrings, setLocale, supportedLocalesToLanguages, defaultLocale, closestSupportedLocale, stringByLocale, languageCode, countryCodeOnly };

View File

@@ -14,6 +14,11 @@ const customCssFilePath = (Setting: typeof SettingType, filename: string): strin
return `${Setting.value('rootProfileDir')}/${filename}`;
};
// Whether the voice typing settings should be visible.
// For now, iOS and web don't support voice typing, so this is Android-only.
const showVoiceTypingSettings = () => {
	const platform = shim.mobilePlatform();
	return platform === 'android';
};
export enum CameraDirection {
Back,
Front,
@@ -1803,8 +1808,7 @@ const builtInMetadata = (Setting: typeof SettingType) => {
appTypes: [AppType.Mobile],
description: () => _('Leave it blank to download the language files from the default website'),
label: () => _('Voice typing language files (URL)'),
// For now, iOS and web don't support voice typing.
show: () => shim.mobilePlatform() === 'android',
show: showVoiceTypingSettings,
section: 'note',
},
@@ -1815,8 +1819,7 @@ const builtInMetadata = (Setting: typeof SettingType) => {
appTypes: [AppType.Mobile],
label: () => _('Preferred voice typing provider'),
isEnum: true,
// For now, iOS and web don't support voice typing.
show: () => shim.mobilePlatform() === 'android',
show: showVoiceTypingSettings,
section: 'note',
options: () => {
@@ -1827,6 +1830,17 @@ const builtInMetadata = (Setting: typeof SettingType) => {
},
},
// User-provided glossary for voice typing. Empty by default. The Whisper
// provider reads it with Setting.value('voiceTyping.glossary') when building
// the transcription prompt.
'voiceTyping.glossary': {
value: '',
type: SettingItemType.String,
public: true,
appTypes: [AppType.Mobile],
label: () => _('Voice typing: Glossary'),
description: () => _('A comma-separated list of words. May be used for uncommon words, to help voice typing spell them correctly.'),
// Only shown when voice typing settings are shown at all and the preferred
// provider's ID starts with "whisper".
show: (settings) => showVoiceTypingSettings() && settings['voiceTyping.preferredProvider'].startsWith('whisper'),
section: 'note',
},
'trash.autoDeletionEnabled': {
value: true,
type: SettingItemType.Bool,

View File

@@ -10,6 +10,12 @@ By default, Joplin uses Whisper.cpp for voice typing.
Whisper.cpp provides a number of pre-trained models for transcribing speech in different languages. Both [English-only and multilingual models](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages) are available. The multilingual models support a variety of different languages. Joplin uses the smallest of the multilingual models by default.
### Preventing spelling mistakes
Joplin allows specifying a glossary for voice typing using the "Voice typing: Glossary" setting (in the "Note" section of settings). Including uncommon words in the glossary makes voice typing more likely to spell them correctly. For example, providing `Scott Joplin, ragtime.` as the glossary helps voice typing correctly spell "Scott Joplin" and "ragtime".
Internally, this is implemented using [prompting](https://cookbook.openai.com/examples/whisper_prompting_guide#pass-names-in-the-prompt-to-prevent-misspellings).
### Downloading the models
By default, Joplin downloads Whisper models from [this GitHub repository](https://github.com/joplin/voice-typing-models). It's possible to download models from a custom location by changing the **Voice typing language files (URL)** setting in the "Note" tab of the configuration screen.
@@ -72,4 +78,4 @@ You can also configure the application to download the models from your own serv
* **Provide the base URL**, eg `https://example.com/models`. Then Joplin will automatically append the filename to that URL, for example it will download the French files from `https://example.com/models/fr.zip`
* **Provide a URL template**. In that case, include a `{lang}` variable, which will be expanded to the language code. For example, if the URL is set to `https://example.com/models/vosk-model-{lang}.zip`, the app will download the French file from `https://example.com/models/vosk-model-fr.zip`. With this option you have more flexibility on where the app should get the file from. For example you can also use query parameters, as in `https://example.com/models/vosk-models.php?lang={lang}&download=true`
* **Provide a URL template**. In that case, include a `{lang}` variable, which will be expanded to the language code. For example, if the URL is set to `https://example.com/models/vosk-model-{lang}.zip`, the app will download the French file from `https://example.com/models/vosk-model-fr.zip`. With this option you have more flexibility on where the app should get the file from. For example you can also use query parameters, as in `https://example.com/models/vosk-models.php?lang={lang}&download=true`