You've already forked joplin
mirror of
https://github.com/laurent22/joplin.git
synced 2025-08-10 22:11:50 +02:00
Android: Voice typing: Add setting to allow specifying a glossary (#12370)
Co-authored-by: Laurent Cozic <laurent22@users.noreply.github.com>
This commit is contained in:
@@ -2,28 +2,62 @@ import { setupDatabase } from '@joplin/lib/testing/test-utils';
|
||||
import whisper from './whisper';
|
||||
import { dirname, join } from 'path';
|
||||
import { exists, mkdir, remove, writeFile } from 'fs-extra';
|
||||
import Setting from '@joplin/lib/models/Setting';
|
||||
import { NativeModules } from 'react-native';
|
||||
const SpeechToTextModule = NativeModules.SpeechToTextModule;
|
||||
|
||||
// Installs a fake SpeechToTextModule on react-native's NativeModules for the
// tests in this file. The fake records the prompt passed to openSession so that
// tests can read it back through testing__lastPrompt().
jest.mock('react-native', () => {
	const reactNative = jest.requireActual('react-native');

	// Most recent prompt passed to openSession (null until a session is opened).
	let lastPrompt: string|null = null;

	// Set properties on reactNative rather than creating a new object with
	// {...reactNative, ...}. Creating a new object triggers deprecation warnings.
	// See https://github.com/facebook/react-native/issues/28839.
	reactNative.NativeModules.SpeechToTextModule = {
		convertNext: () => 'Test. This is test output. Test!',
		runTests: ()=> {},
		openSession: jest.fn((_path, _locale, prompt) => {
			lastPrompt = prompt;

			const someId = 1234;
			return someId;
		}),
		closeSession: jest.fn(),
		startRecording: jest.fn(),
		convertAvailable: jest.fn(() => ''),
		// Test-only accessor: returns the prompt captured by openSession.
		testing__lastPrompt: () => {
			return lastPrompt;
		},
	};

	return reactNative;
});
|
||||
|
||||
// Shape of the config.json written next to the mock model by these tests.
interface ModelConfig {
	output: {
		// Each entry is an array of strings; the tests below use
		// two-element [search, replacement] entries.
		stringReplacements: string[][];
		// Same layout as stringReplacements; the tests below pass
		// regex source text as the first element.
		regexReplacements: string[][];
	};
}
|
||||
|
||||
// Config with no replacements; used by createMockModel when the caller
// doesn't supply one.
const defaultModelConfig: ModelConfig = {
	output: { stringReplacements: [], regexReplacements: [] },
};
|
||||
|
||||
// Writes a placeholder model to disk: creates the directory that contains
// whisper.modelLocalFilepath('en'), a "model" subdirectory inside it, and then
// a dummy model.bin plus a config.json holding `config`.
//
// Returns the path to the created "model" directory.
const createMockModel = async (config: ModelConfig = defaultModelConfig) => {
	const whisperBaseDirectory = dirname(whisper.modelLocalFilepath('en'));
	await mkdir(whisperBaseDirectory);

	const modelDirectory = join(whisperBaseDirectory, 'model');
	await mkdir(modelDirectory);

	await writeFile(join(modelDirectory, 'model.bin'), 'mock model', 'utf-8');
	await writeFile(join(modelDirectory, 'config.json'), JSON.stringify(config), 'utf-8');

	return modelDirectory;
};
|
||||
|
||||
describe('whisper', () => {
|
||||
beforeEach(async () => {
|
||||
await setupDatabase(0);
|
||||
@@ -45,14 +79,7 @@ describe('whisper', () => {
|
||||
});
|
||||
|
||||
test('should apply post-processing replacements specified in the model config', async () => {
|
||||
const whisperBaseDirectory = dirname(whisper.modelLocalFilepath('en'));
|
||||
await mkdir(whisperBaseDirectory);
|
||||
|
||||
const modelDirectory = join(whisperBaseDirectory, 'model');
|
||||
await mkdir(modelDirectory);
|
||||
|
||||
await writeFile(join(modelDirectory, 'model.bin'), 'mock model', 'utf-8');
|
||||
await writeFile(join(modelDirectory, 'config.json'), JSON.stringify({
|
||||
const modelDirectory = await createMockModel({
|
||||
output: {
|
||||
stringReplacements: [
|
||||
['Test', 'replaced'],
|
||||
@@ -61,7 +88,7 @@ describe('whisper', () => {
|
||||
['replace[d]', 'replaced again!'],
|
||||
],
|
||||
},
|
||||
}), 'utf-8');
|
||||
});
|
||||
|
||||
let lastFinalizedText = '';
|
||||
const onFinalize = jest.fn((text: string) => {
|
||||
@@ -85,4 +112,29 @@ describe('whisper', () => {
|
||||
lastFinalizedText,
|
||||
).toBe('\n\nreplaced again!. This is test output. replaced again!!');
|
||||
});
|
||||
|
||||
it.each([
	{ glossary: '', expectedPrompt: '' },
	{ glossary: 'test', expectedPrompt: 'Glossary: test' },
	{ glossary: 'Joplin, app', expectedPrompt: 'Glossary: Joplin, app' },
	// Should not include the "Glossary:" prefix if there's no translation for it
	{ glossary: 'Joplin, app', expectedPrompt: 'Joplin, app', locale: 'testLocale-test' },
])('should construct a prompt from the user-specified glossary (%j)', async ({ glossary, expectedPrompt, locale }) => {
	Setting.setValue('voiceTyping.glossary', glossary);

	const modelDirectory = await createMockModel();
	const session = await whisper.build({
		modelPath: modelDirectory,
		callbacks: {
			// Stop the session once text is finalized.
			onFinalize: () => {
				return session.stop();
			},
			onPreview: jest.fn(),
		},
		// Cases that don't specify a locale run in English.
		locale: locale ?? 'en',
	});
	await session.start();

	// The mocked SpeechToTextModule records the prompt given to openSession.
	expect(SpeechToTextModule.testing__lastPrompt()).toBe(expectedPrompt);
});
|
||||
});
|
||||
|
@@ -5,7 +5,7 @@ import { rtrimSlashes } from '@joplin/utils/path';
|
||||
import { dirname, join } from 'path';
|
||||
import { NativeModules } from 'react-native';
|
||||
import { SpeechToTextCallbacks, VoiceTypingProvider, VoiceTypingSession } from './VoiceTyping';
|
||||
import { languageCodeOnly } from '@joplin/lib/locale';
|
||||
import { languageCodeOnly, stringByLocale } from '@joplin/lib/locale';
|
||||
|
||||
const logger = Logger.create('voiceTyping/whisper');
|
||||
|
||||
@@ -178,8 +178,30 @@ class Whisper implements VoiceTypingSession {
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the glossary part of the transcription prompt from the
// 'voiceTyping.glossary' setting.
//
// Returns an empty string when the glossary is empty or whitespace-only.
// Otherwise returns the glossary, prefixed with "Glossary:" as translated for
// `locale`. The prefix is dropped when no translation is available for a
// non-English locale.
const getGlossaryPrompt = (locale: string) => {
	// Trim first so that a whitespace-only glossary is treated as empty rather
	// than producing a prompt that consists only of the prefix.
	const glossary = `${Setting.value('voiceTyping.glossary') ?? ''}`.trim();
	if (!glossary) return '';

	// Re-define the "_" localization function so that it uses the transcription locale (as opposed to the UI locale).
	const _ = (text: string) => {
		return stringByLocale(locale, text);
	};
	let glossaryPrefix = _('Glossary:');

	// Prefer no prefix if no appropriate translation of "Glossary:" is available:
	if (glossaryPrefix === 'Glossary:' && languageCodeOnly(locale) !== 'en') {
		glossaryPrefix = '';
	}

	// trim() removes the leading space left behind when the prefix is empty.
	return `${glossaryPrefix} ${glossary}`.trim();
};
|
||||
|
||||
// Returns the full prompt for `locale`: the model-provided prompt for the
// locale's language code (if any), followed by the glossary prompt (if any),
// joined with a single space. Returns '' when neither part is present.
const getPrompt = (locale: string, localeToPrompt: Map<string, string>) => {
	const basePrompt = localeToPrompt.get(languageCodeOnly(locale));
	return [
		basePrompt,
		getGlossaryPrompt(locale),
	].filter(part => !!part).join(' '); // Drop undefined/empty parts before joining
};
|
||||
|
||||
const modelLocalDirectory = () => {
|
||||
|
@@ -733,4 +733,4 @@ const stringByLocale = (locale: string, s: string, ...args: any[]): string => {
|
||||
}
|
||||
};
|
||||
|
||||
export { _, _n, supportedLocales, languageName, currentLocale, localesFromLanguageCode, languageCodeOnly, countryDisplayName, localeStrings, setLocale, supportedLocalesToLanguages, defaultLocale, closestSupportedLocale, languageCode, countryCodeOnly };
|
||||
export { _, _n, supportedLocales, languageName, currentLocale, localesFromLanguageCode, languageCodeOnly, countryDisplayName, localeStrings, setLocale, supportedLocalesToLanguages, defaultLocale, closestSupportedLocale, stringByLocale, languageCode, countryCodeOnly };
|
||||
|
@@ -14,6 +14,11 @@ const customCssFilePath = (Setting: typeof SettingType, filename: string): strin
|
||||
return `${Setting.value('rootProfileDir')}/${filename}`;
|
||||
};
|
||||
|
||||
// Shared visibility check for the voice typing settings: true only when
// shim.mobilePlatform() reports 'android'.
const showVoiceTypingSettings = () => (
	// For now, iOS and web don't support voice typing.
	shim.mobilePlatform() === 'android'
);
|
||||
|
||||
export enum CameraDirection {
|
||||
Back,
|
||||
Front,
|
||||
@@ -1803,8 +1808,7 @@ const builtInMetadata = (Setting: typeof SettingType) => {
|
||||
appTypes: [AppType.Mobile],
|
||||
description: () => _('Leave it blank to download the language files from the default website'),
|
||||
label: () => _('Voice typing language files (URL)'),
|
||||
// For now, iOS and web don't support voice typing.
|
||||
show: () => shim.mobilePlatform() === 'android',
|
||||
show: showVoiceTypingSettings,
|
||||
section: 'note',
|
||||
},
|
||||
|
||||
@@ -1815,8 +1819,7 @@ const builtInMetadata = (Setting: typeof SettingType) => {
|
||||
appTypes: [AppType.Mobile],
|
||||
label: () => _('Preferred voice typing provider'),
|
||||
isEnum: true,
|
||||
// For now, iOS and web don't support voice typing.
|
||||
show: () => shim.mobilePlatform() === 'android',
|
||||
show: showVoiceTypingSettings,
|
||||
section: 'note',
|
||||
|
||||
options: () => {
|
||||
@@ -1827,6 +1830,17 @@ const builtInMetadata = (Setting: typeof SettingType) => {
|
||||
},
|
||||
},
|
||||
|
||||
'voiceTyping.glossary': {
|
||||
value: '',
|
||||
type: SettingItemType.String,
|
||||
public: true,
|
||||
appTypes: [AppType.Mobile],
|
||||
label: () => _('Voice typing: Glossary'),
|
||||
description: () => _('A comma-separated list of words. May be used for uncommon words, to help voice typing spell them correctly.'),
|
||||
show: (settings) => showVoiceTypingSettings() && settings['voiceTyping.preferredProvider'].startsWith('whisper'),
|
||||
section: 'note',
|
||||
},
|
||||
|
||||
'trash.autoDeletionEnabled': {
|
||||
value: true,
|
||||
type: SettingItemType.Bool,
|
||||
|
@@ -10,6 +10,12 @@ By default, Joplin uses Whisper.cpp for voice typing.
|
||||
|
||||
Whisper.cpp provides a number of pre-trained models for transcribing speech in different languages. Both [English-only and multilingual models](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages) are available. The multilingual models support a variety of different languages. Joplin uses the smallest of the multilingual models by default.
|
||||
|
||||
### Preventing spelling mistakes
|
||||
|
||||
Joplin allows specifying a glossary for voice typing using the "Voice typing: Glossary" setting (in the "Note" section of settings). Including uncommon words in the glossary makes voice typing more likely to spell them correctly. For example, providing `Scott Joplin, ragtime.` as the glossary helps voice typing correctly spell "Scott Joplin" and "ragtime".
|
||||
|
||||
Internally, this is implemented using [prompting](https://cookbook.openai.com/examples/whisper_prompting_guide#pass-names-in-the-prompt-to-prevent-misspellings).
|
||||
|
||||
### Downloading the models
|
||||
|
||||
By default, Joplin downloads Whisper models from [this GitHub repository](https://github.com/joplin/voice-typing-models). It's possible to download models from a custom location by changing the **Voice typing language files (URL)** setting in the "Note" tab of the configuration screen.
|
||||
@@ -72,4 +78,4 @@ You can also configure the application to download the models from your own serv
|
||||
|
||||
* **Provide the base URL**, eg `https://example.com/models`. Then Joplin will automatically append the filename to that URL, for example it will download the French files from `https://example.com/models/fr.zip`
|
||||
|
||||
* **Provide a URL template**. In that case, include a `{lang}` variable, which will be expanded to the language code. For example, if the URL is set to `https://example.com/models/vosk-model-{lang}.zip`, the app will download the French file from `https://example.com/models/vosk-model-fr.zip`. With this option you have more flexibility on where the app should get the file from. For example you can also use query parameters, as in `https://example.com/models/vosk-models.php?lang={lang}&download=true`
|
||||
* **Provide a URL template**. In that case, include a `{lang}` variable, which will be expanded to the language code. For example, if the URL is set to `https://example.com/models/vosk-model-{lang}.zip`, the app will download the French file from `https://example.com/models/vosk-model-fr.zip`. With this option you have more flexibility on where the app should get the file from. For example you can also use query parameters, as in `https://example.com/models/vosk-models.php?lang={lang}&download=true`
|
||||
|
Reference in New Issue
Block a user