mirror of
https://github.com/laurent22/joplin.git
synced 2025-01-02 12:47:41 +02:00
9f5282c8f5
Co-authored-by: Laurent Cozic <laurent22@users.noreply.github.com>
120 lines
3.8 KiB
TypeScript
120 lines
3.8 KiB
TypeScript
import Setting from '@joplin/lib/models/Setting';
|
|
import shim from '@joplin/lib/shim';
|
|
import Logger from '@joplin/utils/Logger';
|
|
import { rtrimSlashes } from '@joplin/utils/path';
|
|
import { dirname, join } from 'path';
|
|
import { NativeModules } from 'react-native';
|
|
import { SpeechToTextCallbacks, VoiceTypingProvider, VoiceTypingSession } from './VoiceTyping';
|
|
import splitWhisperText from './utils/splitWhisperText';
|
|
|
|
// Logger for this module, created under the 'voiceTyping/whisper' name.
const logger = Logger.create('voiceTyping/whisper');
|
|
|
|
// Native speech-to-text module exposed through React Native's NativeModules.
// NOTE(review): presumably undefined where no native implementation is
// registered -- `whisper.supported()` tests it for truthiness.
const { SpeechToTextModule } = NativeModules;
|
|
|
|
// Whisper output includes timestamp markers of the form <|0.00|>.
// They have been observed:
// - following long pauses,
// - in pairs, between sentences,
// - at the start and at the end of a sequence.
const timestampExp = /<\|(\d+\.\d*)\|>/g;

// Removes timestamp markers and [BLANK_AUDIO] placeholders from raw
// recognizer output. Surrounding whitespace is left untouched.
const postProcessSpeech = (text: string) => {
	const blankAudioExp = /\[BLANK_AUDIO\]/g;
	const withoutTimestamps = text.replace(timestampExp, '');
	const withoutBlankAudio = withoutTimestamps.replace(blankAudioExp, '');
	return withoutBlankAudio;
};
|
|
|
|
// A voice typing session that drives the native SpeechToTextModule.
//
// start() starts the recorder and then repeatedly converts the recorded audio
// to text, reporting results through `callbacks.onPreview` (text that may still
// change) and `callbacks.onFinalize` (text that will not change). The loop runs
// until stop() is called or a native call fails.
class Whisper implements VoiceTypingSession {
	// Most recent text sent to onPreview that has not been finalized yet.
	// Stored unprocessed (it is passed through postProcessSpeech when used).
	private lastPreviewData = '';

	// Incremented by every effective stop(). The inference loop in start()
	// compares this against the value it saw on entry to know when to exit.
	private closeCounter = 0;

	public constructor(
		// Identifier passed to every SpeechToTextModule call. Set to `null`
		// once the session has been closed.
		private sessionId: number|null,
		private callbacks: SpeechToTextCallbacks,
	) { }

	// Starts recording and runs the inference loop. Resolves when the session
	// is stopped.
	//
	// Throws if the session is already closed. If a native call fails, the
	// error is logged, the session is stopped, and the error is rethrown.
	public async start() {
		if (this.sessionId === null) {
			throw new Error('Session closed.');
		}

		// Read the counter BEFORE the first await. If stop() is called while the
		// recorder is still starting, the loop below must not run at all
		// (otherwise it would call into the native module with a null session).
		const loopStartCounter = this.closeCounter;

		try {
			logger.debug('starting recorder');
			await SpeechToTextModule.startRecording(this.sessionId);
			logger.debug('recorder started');

			while (this.closeCounter === loopStartCounter) {
				logger.debug('reading block');
				// NOTE(review): the second argument is presumably a duration in
				// seconds -- confirm against the native module.
				const data: string = await SpeechToTextModule.expandBufferAndConvert(this.sessionId, 4);
				logger.debug('done reading block. Length', data?.length);

				if (this.sessionId === null) {
					logger.debug('Session stopped. Ending inference loop.');
					return;
				}

				const recordingLength = await SpeechToTextModule.getBufferLengthSeconds(this.sessionId);

				// stop() may also have run during the await above. In that case it
				// has already finalized the last preview, so emitting more text (or
				// calling the native module with a null session) would be wrong.
				if (this.sessionId === null) {
					logger.debug('Session stopped. Ending inference loop.');
					return;
				}

				logger.debug('recording length so far', recordingLength);
				const { trimTo, dataBeforeTrim, dataAfterTrim } = splitWhisperText(data, recordingLength);

				if (trimTo > 2) {
					// The start of the recording can be dropped: finalize the text
					// before the trim point and keep the remainder as a preview.
					logger.debug('Trim to', trimTo, 'in recording with length', recordingLength);
					this.callbacks.onFinalize(postProcessSpeech(dataBeforeTrim));
					this.callbacks.onPreview(postProcessSpeech(dataAfterTrim));
					this.lastPreviewData = dataAfterTrim;
					await SpeechToTextModule.dropFirstSeconds(this.sessionId, trimTo);
				} else {
					logger.debug('Preview', data);
					this.lastPreviewData = data;
					this.callbacks.onPreview(postProcessSpeech(data));
				}
			}
		} catch (error) {
			logger.error('Whisper error:', error);
			// Clear the preview so that stop() does not finalize it.
			this.lastPreviewData = '';
			await this.stop();
			throw error;
		}
	}

	// Closes the native session and finalizes any pending preview text.
	// Calling this on an already-closed session only logs a warning.
	public async stop() {
		if (this.sessionId === null) {
			logger.warn('Session already closed.');
			return;
		}

		// Mark the session as closed before awaiting so that a running
		// inference loop observes the change as soon as it resumes.
		const sessionId = this.sessionId;
		this.sessionId = null;
		this.closeCounter ++;
		await SpeechToTextModule.closeSession(sessionId);

		if (this.lastPreviewData) {
			this.callbacks.onFinalize(postProcessSpeech(this.lastPreviewData));
		}
	}
}
|
|
|
|
// Returns the path of the Whisper model file inside the
// `voice-typing-models` folder of the app directory.
const modelLocalFilepath = () => {
	const appDirectory = shim.fsDriver().getAppDirectoryPath();
	return `${appDirectory}/voice-typing-models/whisper_tiny.onnx`;
};
|
|
|
|
// Provider definition for Whisper-based voice typing.
const whisper: VoiceTypingProvider = {
	// True only when the native speech-to-text module is present.
	supported: () => Boolean(SpeechToTextModule),
	modelLocalFilepath,
	// Builds the model download URL from the 'voiceTypingBaseUrl' setting,
	// falling back to a built-in template when the setting is empty. Every
	// `{task}` placeholder is replaced with the model name.
	getDownloadUrl: () => {
		const fallbackTemplate = 'https://github.com/personalizedrefrigerator/joplin-voice-typing-test/releases/download/test-release/{task}.zip';
		const configuredTemplate = rtrimSlashes(Setting.value('voiceTypingBaseUrl').trim());
		const template = configuredTemplate ? configuredTemplate : fallbackTemplate;
		return template.replace(/\{task\}/g, 'whisper_tiny.onnx');
	},
	// Path of the `uuid` file stored next to the model file.
	getUuidPath: () => {
		const modelDirectory = dirname(modelLocalFilepath());
		return join(modelDirectory, 'uuid');
	},
	// Opens a native session for the given model and locale, then wraps it in
	// a Whisper session object.
	build: async ({ modelPath, callbacks, locale }) => {
		return new Whisper(
			await SpeechToTextModule.openSession(modelPath, locale),
			callbacks,
		);
	},
	modelName: 'whisper',
};
|
|
|
|
export default whisper;
|