mirror of https://github.com/MarkParker5/STARK.git synced 2025-02-17 11:55:35 +02:00

offline realtime speech recognition

new Controls flow in progress
This commit is contained in:
MarkParker5 2021-12-04 01:27:10 +02:00
parent 9fc9105a2d
commit 585aefa5f4
25 changed files with 393 additions and 138 deletions
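The headline change: the blocking speech_recognition/Google pipeline is replaced by offline streaming recognition with Vosk. A minimal sketch of that loop, assuming only a locally downloaded model directory such as model-small-rus (the new SpeechRecognizer below wraps exactly this pattern):

import json
import queue

import sounddevice
import vosk

vosk.SetLogLevel(-1)
audioQueue = queue.Queue()
model = vosk.Model('model-small-rus')               # assumes the model was downloaded locally
recognizer = vosk.KaldiRecognizer(model, 16000)

def callback(indata, frames, time, status):
    audioQueue.put(bytes(indata))                   # sounddevice hands over raw PCM chunks

with sounddevice.RawInputStream(samplerate=16000, blocksize=8000,
                                dtype='int16', channels=1, callback=callback):
    while True:
        data = audioQueue.get()
        if recognizer.AcceptWaveform(data):         # True once an utterance is finished
            print(json.loads(recognizer.Result())['text'])
        else:
            print(json.loads(recognizer.PartialResult())['partial'], end='\r')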

View File

@@ -9,9 +9,9 @@ class Command(ABC):
patterns: list[Pattern]
start: Callable
def __init__(self, name, patterns = [], primary = True):
def __init__(self, name: str, patterns: list[str] = [], primary: bool = True):
self._name = name
self._patterns = patterns
self._patterns = [Pattern(pattern) for pattern in patterns]
self.primary = primary
from .CommandsManager import CommandsManager

View File

@@ -13,7 +13,7 @@ class SearchResult:
class CommandsManager:
allCommands: list[Command] = []
QA: Command
QA: Command = None
def __new__(cls): # Singleton
if not hasattr(cls, 'instance'):
@@ -21,15 +21,18 @@ class CommandsManager:
return cls.instance
def search(self, string: str, commands: list[Command]) -> list[SearchResult]:
string = string.lower()
results: list[SearchResult] = []
acstring = ACString(string)
acstring = ACString(string.lower())
# find command obj by pattern
for command in commands:
for pattern in command.patterns:
if groupdict := pattern.match(string):
parameters: dict[str: ACObject] = {'string': acstring,}
groupdict = pattern.match(acstring)
if groupdict is not None:
parameters: dict[str, ACObject] = {'string': acstring}
for key, value in groupdict.items():
name, typeName = key.split(':')
ACType: Type[ACObject] = CommandsManager.classFromString(typeName)
@@ -40,7 +43,9 @@ class CommandsManager:
results.append(SearchResult(command, parameters))
if results: return results
else: return [SearchResult(self.QA, {'string': acstring,}),]
elif qa := self.QA: return [SearchResult(qa, {'string': acstring,}),]
return []
def append(self, command):
if hasattr(self, command.name):
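For context, the __new__-based singleton above means every CommandsManager() call returns one shared object, which is what makes the CommandsManager().QA assignment later in this commit visible everywhere. A minimal standalone check of the idiom:

class Singleton:
    def __new__(cls):
        if not hasattr(cls, 'instance'):
            cls.instance = super().__new__(cls)     # create the shared object once
        return cls.instance

a, b = Singleton(), Singleton()
assert a is b                                       # both names point to the same instance
a.QA = 'handler'
assert b.QA == 'handler'                            # attributes set on one are seen by all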

View File

@@ -21,7 +21,9 @@ class Pattern:
# find and transform arguments like $name:Type
argumentRegex = re.compile(r'\$[:word:]:[:word:]')
while match := re.search(argumentRegex, pattern)[0]:
reMatches = re.findall(argumentRegex, pattern)
while reMatches:
match = reMatches.pop(0)
arg: str = match[1:]
argName, argTypeName = arg.split(':')
argType: Type[ACObject] = classFromString(argTypeName)
@@ -30,7 +32,7 @@ class Pattern:
return re.compile(pattern)
def match(self, string: str) -> Optional[dict[str, str]]:
if match := re.search(self.compiled, string):
def match(self, string: ACString) -> Optional[dict[str, str]]:
if match := re.search(self.compiled, string.value):
return match.groupdict()
return None
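Sidebar on the rewritten argument loop: re.search returns a single Match object (or None), so a pop-driven loop needs the list that re.findall produces, as in the fix above. A standalone sketch using a plain \w-based regex as a stand-in for the project-specific [:word:] syntax, with a hypothetical pattern string:

import re

argumentRegex = re.compile(r'\$\w+:\w+')            # simplified stand-in for [:word:]
pattern = 'set timer for $time:ACTimeInterval'      # hypothetical command pattern
matches = re.findall(argumentRegex, pattern)        # every $name:Type token, as strings
while matches:
    match = matches.pop(0)
    argName, argTypeName = match[1:].split(':')     # drop '$', split into name and type
    print(argName, argTypeName)                     # -> time ACTimeInterval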

View File

@@ -10,3 +10,7 @@ class Control(Singleton):
@abstractmethod
def start(self):
pass
@abstractmethod
def stop(self):
pass

View File

@@ -1,99 +1,96 @@
#!/usr/local/bin/python3.8
from typing import Optional
import os
from ..Control import Control
from General import SpeechRecognition, Text2Speech
from ArchieCore import CommandsManager
import config
from ..Control import Control
from General import SpeechRecognizer, Text2Speech
from ArchieCore import CommandsManager, Command, Response, ResponseAction, ThreadData
'''
TODO: async
self.check_threads()
self.report()
'''
class VoiceAssistant(Control):
commandsManager = CommandsManager()
listener = SpeechRecognition.SpeechToText()
voice = Text2Speech.Engine()
threads = []
reports = []
memory = []
voids = 0
speechRecognizer = SpeechRecognizer()
voice = Text2Speech.Engine()
lastClapTime = 0
doubleClap = False
commandsContext: list[list[Command]] = []
threads: list[ThreadData] = []
reports: list[Response] = []
memory: list[Response] = []
voids: int = 0
lastClapTime: float = 0
doubleClap: bool = False
def __init__(self):
pass
def start(self):
self.listener.listen_noise()
os.system('clear')
self.commandsContext = [self.commandsManager.allCommands,]
self.speechRecognizer.didReceivePartialResult = self.speechRecognizerReceivePartialResult
self.speechRecognizer.didReceiveFinalResult = self.speechRecognizerReceiveFinalResult
self.speechRecognizer.startListening()
while True:
if self.voids >= 3:
self.voids = 0
if config.double_clap_activation:
print('\nSleep (-_-)zzZZ\n')
sleep()
def stop(self):
self.speechRecognizer.stopListening()
print('\nYou: ', end='')
speech = self.listener.listen()
print(speech.get('text') or '', end='')
def speechRecognizerReceivePartialResult(self, result: str):
print(f'\rYou: \x1B[3m{result}\x1B[0m', end = '')
while True:
if speech['status'] == 'error':
break
if speech['status'] == 'void':
self.voids += 1
break
text = speech['text']
def speechRecognizerReceiveFinalResult(self, result: str):
print(f'\rYou: {result}')
for result in self.commandsManager.search(text, self.commandsManager.allCommands):
try: response = result.command.start(result.parameters)
except: break
currentContext = self.commandsContext[0] if self.commandsContext else None
self.reply(response)
self.check_threads()
self.report()
while self.commandsContext:
if searchResults := self.commandsManager.search(string = result, commands = currentContext):
for searchResult in searchResults:
commandResponse = searchResult.command.start(params = searchResult.parameters)
self.parse(commandResponse)
if response.callback:
speech = recognize(response.callback, {})
else:
break
match commandResponse.action:
case ResponseAction.popContext:
self.commandsContext.pop(0)
case ResponseAction.popToRootContext:
self.commandsContext = [self.commandsManager.allCommands,]
break
case ResponseAction.sleep:
self.stop()
case ResponseAction.repeatLastAnswer:
if self.memory:
previousResponse = self.memory[-1]
self.reply(previousResponse)
break
else:
currentContext = self.commandsContext.pop(0)
else:
self.commandsContext.append(self.commandsManager.allCommands)
def recognize(self, callback, params):
print('\nYou: ', end='')
speech = self.listener.listen()
if speech['status'] in ['error', 'void']:
return speech
text = speech['text']
print(text, end='')
def parse(self, response):
self.reply(response)
if response.thread: # add background thread to list
self.threads.append(response.thread)
if response.context: # insert context if exist
self.commandsContext.insert(0, response.context)
self.memory.append(response)
while True:
self.check_threads()
if not callback: break
self.memory.insert(0, {
'text': text,
'cmd': cmd,
'response': response,
})
speech = recognize(response.callback, params)
if callback.once: break
return speech
def reply(self, response):
if response.text: # print answer
print('\nArchie: '+response.text)
if response.voice: # say answer
self.voice.generate(response.voice).speak()
def report(self):
for response in self.reports:
if response.voice:
self.voice.generate(response.voice).speak()
time.sleep(2)
self.reply(response)
self.reports = []
def reply(self, response):
if response.text: # print answer
print('\nArchie: '+response.text)
if response.voice: # say answer
self.voice.generate(response.voice).speak()
if response.thread: # add background thread to stack
self.threads.append(response.thread)
def check_threads(self):
for thread in self.threads:
if not thread['finish_event'].is_set(): continue

View File

@@ -74,4 +74,5 @@ def qa_start(params):
# Russian fallback phrases; roughly: "I don't quite understand what you mean." / "That last phrase isn't clear to me." / "That's not quite clear." / "Can you say the same thing in other words?" / "Right now I don't understand you at all." / "Try to phrase the thought differently."
voice = text = search or random.choice(['Не совсем понимаю, о чём вы.', 'Вот эта последняя фраза мне не ясна.', 'А вот это не совсем понятно.', 'Можете сказать то же самое другими словами?', 'Вот сейчас я совсем вас не понимаю.', 'Попробуйте выразить свою мысль по-другому',])
return Response(text = text, voice = voice)
CommandsManager.QA = qa_start
CommandsManager().QA = qa_start
print(CommandsManager().QA, 'CommandsManager Sets QA')
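The fix here moves the assignment from the class to the singleton instance returned by CommandsManager(); because __new__ always hands back that same instance, the handler persists across every later call. A quick hedged check (qa_start as defined above):

CommandsManager().QA = qa_start
assert CommandsManager().QA is qa_start             # later calls see the same handler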

View File

@@ -1,46 +1,55 @@
import speech_recognition as sr
from typing import Callable
import os, sys
import json
import queue
import sounddevice
import vosk
import config
#r = sr.Recognizer()
#m = sr.Microphone(device_index=config.device_index)
vosk.SetLogLevel(-1)
class SpeechToText:
def __init__(self, device = config.device_index, language = config.language_code):
self.device = device
self.language = language
self.m = sr.Microphone(device_index = self.device)
self.r = sr.Recognizer()
self.r.pause_threshold = config.pause_threshold
self.r.energy_threshold = config.energy_threshold
self.r.dynamic_energy_threshold = config.dynamic_energy_threshold
self.r.non_speaking_duration = config.non_speaking_duration
class SpeechRecognizer:
didReceivePartialResult: Callable[[str], None] = lambda self, _: None
didReceiveFinalResult: Callable[[str], None] = lambda self, _: None
def listen(self):
try:
with self.m as source:
audio = self.r.listen(source)
except:
return ''
try:
response = {'text': self.r.recognize_google(audio, language = self.language).lower(), 'status': 'ok'}
except sr.UnknownValueError:
response = {'text': None, 'status': 'void'}
except sr.RequestError:
response = {'text': None, 'status': 'error'}
return response
_isListening = False
def recognize(self, speech):
with sr.AudioFile(speech.getPath()) as source:
audio = r.record(source)
try:
return r.recognize_google(audio)
except:
return ''
audioQueue = queue.Queue()
model = vosk.Model(config.vosk_model)
def listen_noise(self):
with self.m as source:
self.r.adjust_for_ambient_noise(source)
samplerate = int(sounddevice.query_devices(kind = 'input')['default_samplerate'])
blocksize = 8000
dtype = 'int16'
channels = 1
kaldiRecognizer = vosk.KaldiRecognizer(model, samplerate)
def set_device(self, index):
self.device = index
self.m = sr.Microphone(device_index = self.device)
def audioInputCallback(self, indata, frames, time, status):
self.audioQueue.put(bytes(indata))
def stopListening(self):
self._isListening = False
def startListening(self):
self._isListening = True
callback = self.audioInputCallback
kwargs = {
'samplerate': self.samplerate,
'blocksize': self.blocksize,
'dtype': self.dtype,
'channels': self.channels,
'callback': callback
}
with sounddevice.RawInputStream(**kwargs):
while self._isListening:
data = self.audioQueue.get()
if self.kaldiRecognizer.AcceptWaveform(data):
result = json.loads(self.kaldiRecognizer.Result())
self.didReceiveFinalResult(result['text'])
else:
result = json.loads(self.kaldiRecognizer.PartialResult())
self.didReceivePartialResult(result['partial'])
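Hypothetical usage of the new callback-driven recognizer, mirroring how VoiceAssistant.start wires it up earlier in this commit:

recognizer = SpeechRecognizer()
recognizer.didReceivePartialResult = lambda text: print(f'\rYou: {text}', end='')
recognizer.didReceiveFinalResult = lambda text: print(f'\rYou: {text}')
recognizer.startListening()   # blocks; a callback must call stopListening() to exit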

View File

@@ -1 +1 @@
from .SpeechRecognition import *
from .SpeechRecognition import SpeechRecognizer

View File

@@ -7,14 +7,7 @@ goole_tts_json_key = path+'google-cloud-text-to-speech-private-key.json'
db_name = 'archie.db'
language_code = 'ru-RU'
device_index = 1
voice_volume = 1
energy_threshold = 2000
dynamic_energy_threshold = True
pause_threshold = 1
non_speaking_duration = 1
vosk_model = 'model-small-rus' # from alphacephei.com/vosk/models
double_clap_activation = False

View File

@@ -1,11 +1,17 @@
Python 3.10
pip install SpeechRecognition
pip install sounddevice
pip install vosk
# download model from https://alphacephei.com/vosk/models
pip install pyaudio # if installation fails, try installing from the .whl (https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyaudio)
pip install google-cloud-texttospeech
pip install pygame
pip install PyTelegramBotApi
pip install bs4
pip install wikipedia
pip install xlrd
pip install xlwt
pip install xlutils

model-small-rus/README Normal file
View File

@@ -0,0 +1,8 @@
Small Russian model for Vosk (Android, RPi, other small devices)
%WER 22.71 [ 9092 / 40042, 1124 ins, 1536 del, 6432 sub ] exp/chain_a/tdnn/decode_test_audiobooks_look_fast/wer_10_0.0
%WER 11.79 [ 5940 / 50394, 894 ins, 832 del, 4214 sub ] exp/chain_a/tdnn/decode_test_golos_crowd_look_fast/wer_11_0.0
%WER 21.34 [ 1789 / 8382, 173 ins, 440 del, 1176 sub ] exp/chain_a/tdnn/decode_test_golos_farfield_look_fast/wer_10_0.0
%WER 29.89 [ 5579 / 18666, 476 ins, 1550 del, 3553 sub ] exp/chain_a/tdnn/decode_test_sova_devices_look_fast/wer_10_0.0
%WER 31.97 [ 13588 / 42496, 1013 ins, 3640 del, 8935 sub ] exp/chain_a/tdnn/decode_test_youtube_look_fast/wer_9_0.0

Binary file not shown.

View File

@@ -0,0 +1,7 @@
--sample-frequency=16000
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-downsample=true

View File

@@ -0,0 +1,10 @@
--min-active=200
--max-active=3000
--beam=10.0
--lattice-beam=2.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=1.0
--endpoint.rule4.min-trailing-silence=2.0

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,5 @@
9855
9856
9857
9858
9859

View File

@@ -0,0 +1,202 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
111 begin
112 end
113 internal
114 singleton
115 begin
116 end
117 internal
118 singleton
119 begin
120 end
121 internal
122 singleton
123 begin
124 end
125 internal
126 singleton
127 begin
128 end
129 internal
130 singleton
131 begin
132 end
133 internal
134 singleton
135 begin
136 end
137 internal
138 singleton
139 begin
140 end
141 internal
142 singleton
143 begin
144 end
145 internal
146 singleton
147 begin
148 end
149 internal
150 singleton
151 begin
152 end
153 internal
154 singleton
155 begin
156 end
157 internal
158 singleton
159 begin
160 end
161 internal
162 singleton
163 begin
164 end
165 internal
166 singleton
167 begin
168 end
169 internal
170 singleton
171 begin
172 end
173 internal
174 singleton
175 begin
176 end
177 internal
178 singleton
179 begin
180 end
181 internal
182 singleton
183 begin
184 end
185 internal
186 singleton
187 begin
188 end
189 internal
190 singleton
191 begin
192 end
193 internal
194 singleton
195 begin
196 end
197 internal
198 singleton
199 begin
200 end
201 internal
202 singleton

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,3 @@
[
8.330133e+10 -4.600894e+09 -2.394861e+09 2.127165e+09 -9.355799e+09 -9.378007e+09 -1.302309e+10 -9.460417e+09 -9.260028e+09 -4.58608e+09 -5.287111e+09 -1.972033e+09 -6.090821e+09 -1.336419e+09 -5.214569e+09 -2.321841e+09 -3.889789e+09 -1.060202e+09 -2.065653e+09 -2.684904e+08 -7.4007e+08 -4587485 -1.315853e+08 -8597548 2.599227e+08 7.408538e+07 5.505751e+08 -1.161846e+07 5.138103e+08 -1.828159e+08 4.251498e+08 -2.901496e+07 6.469246e+08 2.489644e+08 6.289868e+08 2.490337e+08 3.38884e+08 -1.788837e+08 -2.536016e+08 -1.591728e+08 8.388078e+08
8.660994e+12 4.637783e+11 3.366465e+11 4.467952e+11 5.094759e+11 5.179353e+11 6.145244e+11 4.970492e+11 5.014889e+11 4.027981e+11 3.937422e+11 3.602942e+11 3.162307e+11 2.40687e+11 2.267307e+11 1.563018e+11 1.341105e+11 8.535779e+10 6.12398e+10 3.207774e+10 1.737325e+10 5.704115e+09 7.980573e+08 2.168777e+08 2.763352e+09 6.859176e+09 1.214891e+10 1.604714e+10 2.005353e+10 2.240119e+10 2.366007e+10 2.300222e+10 2.406182e+10 2.354406e+10 2.098983e+10 1.619869e+10 1.491578e+10 1.224871e+10 9.502735e+09 6.517532e+09 0 ]

View File

@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

View File

@@ -0,0 +1,2 @@
--left-context=3
--right-context=3

View File

@@ -7,9 +7,9 @@ import Controls
def main():
controls = [
Controls.VoiceAssistant(),
Controls.TelegramBot(),
Controls.RemoteControl(),
Controls.Django(),
#Controls.TelegramBot(),
#Controls.RemoteControl(),
#Controls.Django(),
]
processes = []