mirror of https://github.com/MarkParker5/STARK.git synced 2025-02-17 11:55:35 +02:00

offline realtime speech recognition

new Controls flow in progress
This commit is contained in:
MarkParker5 2021-12-04 01:27:10 +02:00
parent 9fc9105a2d
commit 585aefa5f4
25 changed files with 393 additions and 138 deletions
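The headline change: the blocking speech_recognition/Google pipeline is replaced by offline streaming recognition with Vosk. A minimal sketch of that loop, assuming only a locally downloaded model directory such as model-small-rus (the new SpeechRecognizer below wraps exactly this pattern):

import json
import queue

import sounddevice
import vosk

vosk.SetLogLevel(-1)
audioQueue = queue.Queue()
model = vosk.Model('model-small-rus')               # assumes the model was downloaded locally
recognizer = vosk.KaldiRecognizer(model, 16000)

def callback(indata, frames, time, status):
    audioQueue.put(bytes(indata))                   # sounddevice hands over raw PCM chunks

with sounddevice.RawInputStream(samplerate=16000, blocksize=8000,
                                dtype='int16', channels=1, callback=callback):
    while True:
        data = audioQueue.get()
        if recognizer.AcceptWaveform(data):         # True once an utterance is finished
            print(json.loads(recognizer.Result())['text'])
        else:
            print(json.loads(recognizer.PartialResult())['partial'], end='\r')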

View File

@@ -9,9 +9,9 @@ class Command(ABC):
patterns: list[Pattern]
start: Callable
def __init__(self, name, patterns = [], primary = True):
def __init__(self, name: str, patterns: list[str] = [], primary: bool = True):
self._name = name
self._patterns = patterns
self._patterns = [Pattern(pattern) for pattern in patterns]
self.primary = primary
from .CommandsManager import CommandsManager

View File

@@ -13,7 +13,7 @@ class SearchResult:
class CommandsManager:
allCommands: list[Command] = []
QA: Command
QA: Command = None
def __new__(cls): # Singleton
if not hasattr(cls, 'instance'):
@@ -21,15 +21,18 @@ class CommandsManager:
return cls.instance
def search(self, string: str, commands: list[Command]) -> list[SearchResult]:
string = string.lower()
results: list[SearchResult] = []
acstring = ACString(string)
acstring = ACString(string.lower())
# find command obj by pattern
for command in commands:
for pattern in command.patterns:
if groupdict := pattern.match(string):
parameters: dict[str: ACObject] = {'string': acstring,}
groupdict = pattern.match(acstring)
if groupdict is not None:
parameters: dict[str, ACObject] = {'string': acstring}
for key, value in groupdict.items():
name, typeName = key.split(':')
ACType: Type[ACObject] = CommandsManager.classFromString(typeName)
@@ -40,7 +43,9 @@ class CommandsManager:
results.append(SearchResult(command, parameters))
if results: return results
else: return [SearchResult(self.QA, {'string': acstring,}),]
elif qa := self.QA: return [SearchResult(qa, {'string': acstring,}),]
return []
def append(self, command):
if hasattr(self, command.name):
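For context, the __new__-based singleton above means every CommandsManager() call returns one shared object, which is what makes the CommandsManager().QA assignment later in this commit visible everywhere. A minimal standalone check of the idiom:

class Singleton:
    def __new__(cls):
        if not hasattr(cls, 'instance'):
            cls.instance = super().__new__(cls)     # create the shared object once
        return cls.instance

a, b = Singleton(), Singleton()
assert a is b                                       # both names point to the same instance
a.QA = 'handler'
assert b.QA == 'handler'                            # attributes set on one are seen by all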

View File

@@ -21,7 +21,9 @@ class Pattern:
# find and transform arguments like $name:Type
argumentRegex = re.compile(r'\$[:word:]:[:word:]')
while match := re.search(argumentRegex, pattern)[0]:
reMatches = re.findall(argumentRegex, pattern)
while reMatches:
match = reMatches.pop(0)
arg: str = match[1:]
argName, argTypeName = arg.split(':')
argType: Type[ACObject] = classFromString(argTypeName)
@@ -30,7 +32,7 @@ class Pattern:
return re.compile(pattern)
def match(self, string: str) -> Optional[dict[str, str]]:
if match := re.search(self.compiled, string):
def match(self, string: ACString) -> Optional[dict[str, str]]:
if match := re.search(self.compiled, string.value):
return match.groupdict()
return None
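Sidebar on the rewritten argument loop: re.search returns a single Match object (or None), so a pop-driven loop needs the list that re.findall produces, as in the fix above. A standalone sketch using a plain \w-based regex as a stand-in for the project-specific [:word:] syntax, with a hypothetical pattern string:

import re

argumentRegex = re.compile(r'\$\w+:\w+')            # simplified stand-in for [:word:]
pattern = 'set timer for $time:ACTimeInterval'      # hypothetical command pattern
matches = re.findall(argumentRegex, pattern)        # every $name:Type token, as strings
while matches:
    match = matches.pop(0)
    argName, argTypeName = match[1:].split(':')     # drop '$', split into name and type
    print(argName, argTypeName)                     # -> time ACTimeInterval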

View File

@@ -10,3 +10,7 @@ class Control(Singleton):
@abstractmethod
def start(self):
pass
@abstractmethod
def stop(self):
pass

View File

@@ -1,99 +1,96 @@
#!/usr/local/bin/python3.8
from typing import Optional
import os
from ..Control import Control
from General import SpeechRecognition, Text2Speech
from ArchieCore import CommandsManager
import config
from ..Control import Control
from General import SpeechRecognizer, Text2Speech
from ArchieCore import CommandsManager, Command, Response, ResponseAction, ThreadData
'''
TODO: async
self.check_threads()
self.report()
'''
class VoiceAssistant(Control):
commandsManager = CommandsManager()
listener = SpeechRecognition.SpeechToText()
voice = Text2Speech.Engine()
threads = []
reports = []
memory = []
voids = 0
speechRecognizer = SpeechRecognizer()
voice = Text2Speech.Engine()
lastClapTime = 0
doubleClap = False
commandsContext: list[list[Command]] = []
threads: list[ThreadData] = []
reports: list[Response] = []
memory: list[Response] = []
voids: int = 0
lastClapTime: float = 0
doubleClap: bool = False
def __init__(self):
pass
def start(self):
self.listener.listen_noise()
os.system('clear')
self.commandsContext = [self.commandsManager.allCommands,]
self.speechRecognizer.didReceivePartialResult = self.speechRecognizerReceivePartialResult
self.speechRecognizer.didReceiveFinalResult = self.speechRecognizerReceiveFinalResult
self.speechRecognizer.startListening()
while True:
if self.voids >= 3:
self.voids = 0
if config.double_clap_activation:
print('\nSleep (-_-)zzZZ\n')
sleep()
def stop(self):
self.speechRecognizer.stopListening()
print('\nYou: ', end='')
speech = self.listener.listen()
print(speech.get('text') or '', end='')
def speechRecognizerReceivePartialResult(self, result: str):
print(f'\rYou: \x1B[3m{result}\x1B[0m', end = '')
while True:
if speech['status'] == 'error':
break
if speech['status'] == 'void':
self.voids += 1
break
text = speech['text']
def speechRecognizerReceiveFinalResult(self, result: str):
print(f'\rYou: {result}')
for result in self.commandsManager.search(text, self.commandsManager.allCommands):
try: response = result.command.start(result.parameters)
except: break
currentContext = self.commandsContext[0] if self.commandsContext else None
self.reply(response)
self.check_threads()
self.report()
while self.commandsContext:
if searchResults := self.commandsManager.search(string = result, commands = currentContext):
for searchResult in searchResults:
commandResponse = searchResult.command.start(params = searchResult.parameters)
self.parse(commandResponse)
if response.callback:
speech = recognize(response.callback, {})
else:
break
match commandResponse.action:
case ResponseAction.popContext:
self.commandsContext.pop(0)
case ResponseAction.popToRootContext:
self.commandsContext = [self.commandsManager.allCommands,]
break
case ResponseAction.sleep:
self.stop()
case ResponseAction.repeatLastAnswer:
if self.memory:
previousResponse = self.memory[-1]
self.reply(previousResponse)
break
else:
currentContext = self.commandsContext.pop(0)
else:
self.commandsContext.append(self.commandsManager.allCommands)
def recognize(self, callback, params):
print('\nYou: ', end='')
speech = self.listener.listen()
if speech['status'] in ['error', 'void']:
return speech
text = speech['text']
print(text, end='')
def parse(self, response):
self.reply(response)
if response.thread: # add background thread to list
self.threads.append(response.thread)
if response.context: # insert context if exist
self.commandsContext.insert(0, response.context)
self.memory.append(response)
while True:
self.check_threads()
if not callback: break
self.memory.insert(0, {
'text': text,
'cmd': cmd,
'response': response,
})
speech = recognize(response.callback, params)
if callback.once: break
return speech
def reply(self, response):
if response.text: # print answer
print('\nArchie: '+response.text)
if response.voice: # say answer
self.voice.generate(response.voice).speak()
def report(self):
for response in self.reports:
if response.voice:
self.voice.generate(response.voice).speak()
time.sleep(2)
self.reply(response)
self.reports = []
def reply(self, response):
if response.text: # print answer
print('\nArchie: '+response.text)
if response.voice: # say answer
self.voice.generate(response.voice).speak()
if response.thread: # add background thread to stack
self.threads.append(response.thread)
def check_threads(self):
for thread in self.threads:
if not thread['finish_event'].is_set(): continue

View File

@@ -74,4 +74,5 @@ def qa_start(params):
# Russian fallback phrases; roughly: "I don't quite understand what you mean." / "That last phrase isn't clear to me." / "That's not quite clear." / "Can you say the same thing in other words?" / "Right now I don't understand you at all." / "Try to phrase the thought differently."
voice = text = search or random.choice(['Не совсем понимаю, о чём вы.', 'Вот эта последняя фраза мне не ясна.', 'А вот это не совсем понятно.', 'Можете сказать то же самое другими словами?', 'Вот сейчас я совсем вас не понимаю.', 'Попробуйте выразить свою мысль по-другому',])
return Response(text = text, voice = voice)
CommandsManager.QA = qa_start
CommandsManager().QA = qa_start
print(CommandsManager().QA, 'CommandsManager Sets QA')
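The fix here moves the assignment from the class to the singleton instance returned by CommandsManager(); because __new__ always hands back that same instance, the handler persists across every later call. A quick hedged check (qa_start as defined above):

CommandsManager().QA = qa_start
assert CommandsManager().QA is qa_start             # later calls see the same handler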

View File

@@ -1,46 +1,55 @@
import speech_recognition as sr
from typing import Callable
import os, sys
import json
import queue
import sounddevice
import vosk
import config
#r = sr.Recognizer()
#m = sr.Microphone(device_index=config.device_index)
vosk.SetLogLevel(-1)
class SpeechToText:
def __init__(self, device = config.device_index, language = config.language_code):
self.device = device
self.language = language
self.m = sr.Microphone(device_index = self.device)
self.r = sr.Recognizer()
self.r.pause_threshold = config.pause_threshold
self.r.energy_threshold = config.energy_threshold
self.r.dynamic_energy_threshold = config.dynamic_energy_threshold
self.r.non_speaking_duration = config.non_speaking_duration
class SpeechRecognizer:
didReceivePartialResult: Callable[[str], None] = lambda self, _: None
didReceiveFinalResult: Callable[[str], None] = lambda self, _: None
def listen(self):
try:
with self.m as source:
audio = self.r.listen(source)
except:
return ''
try:
response = {'text': self.r.recognize_google(audio, language = self.language).lower(), 'status': 'ok'}
except sr.UnknownValueError:
response = {'text': None, 'status': 'void'}
except sr.RequestError:
response = {'text': None, 'status': 'error'}
return response
_isListening = False
def recognize(self, speech):
with sr.AudioFile(speech.getPath()) as source:
audio = r.record(source)
try:
return r.recognize_google(audio)
except:
return ''
audioQueue = queue.Queue()
model = vosk.Model(config.vosk_model)
def listen_noise(self):
with self.m as source:
self.r.adjust_for_ambient_noise(source)
samplerate = int(sounddevice.query_devices(kind = 'input')['default_samplerate'])
blocksize = 8000
dtype = 'int16'
channels = 1
kaldiRecognizer = vosk.KaldiRecognizer(model, samplerate)
def set_device(self, index):
self.device = index
self.m = sr.Microphone(device_index = self.device)
def audioInputCallback(self, indata, frames, time, status):
self.audioQueue.put(bytes(indata))
def stopListening(self):
self._isListening = False
def startListening(self):
self._isListening = True
callback = self.audioInputCallback
kwargs = {
'samplerate': self.samplerate,
'blocksize': self.blocksize,
'dtype': self.dtype,
'channels': self.channels,
'callback': callback
}
with sounddevice.RawInputStream(**kwargs):
while self._isListening:
data = self.audioQueue.get()
if self.kaldiRecognizer.AcceptWaveform(data):
result = json.loads(self.kaldiRecognizer.Result())
self.didReceiveFinalResult(result['text'])
else:
result = json.loads(self.kaldiRecognizer.PartialResult())
self.didReceivePartialResult(result['partial'])
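Hypothetical usage of the new callback-driven recognizer, mirroring how VoiceAssistant.start wires it up earlier in this commit:

recognizer = SpeechRecognizer()
recognizer.didReceivePartialResult = lambda text: print(f'\rYou: {text}', end='')
recognizer.didReceiveFinalResult = lambda text: print(f'\rYou: {text}')
recognizer.startListening()   # blocks; a callback must call stopListening() to exit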

View File

@@ -1 +1 @@
from .SpeechRecognition import *
from .SpeechRecognition import SpeechRecognizer

View File

@@ -7,14 +7,7 @@ goole_tts_json_key = path+'google-cloud-text-to-speech-private-key.json'
db_name = 'archie.db'
language_code = 'ru-RU'
device_index = 1
voice_volume = 1
energy_threshold = 2000
dynamic_energy_threshold = True
pause_threshold = 1
non_speaking_duration = 1
vosk_model = 'model-small-rus' # from alphacephei.com/vosk/models
double_clap_activation = False

View File

@@ -1,11 +1,17 @@
Python 3.10
pip install SpeechRecognition
pip install sounddevice
pip install vosk
# download model from https://alphacephei.com/vosk/models
pip install pyaudio # if installation fails, try installing from the .whl (https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyaudio)
pip install google-cloud-texttospeech
pip install pygame
pip install PyTelegramBotApi
pip install bs4
pip install wikipedia
pip install xlrd
pip install xlwt
pip install xlutils

model-small-rus/README Normal file
View File

@@ -0,0 +1,8 @@
Small Russian model for Vosk (Android, RPi, other small devices)
%WER 22.71 [ 9092 / 40042, 1124 ins, 1536 del, 6432 sub ] exp/chain_a/tdnn/decode_test_audiobooks_look_fast/wer_10_0.0
%WER 11.79 [ 5940 / 50394, 894 ins, 832 del, 4214 sub ] exp/chain_a/tdnn/decode_test_golos_crowd_look_fast/wer_11_0.0
%WER 21.34 [ 1789 / 8382, 173 ins, 440 del, 1176 sub ] exp/chain_a/tdnn/decode_test_golos_farfield_look_fast/wer_10_0.0
%WER 29.89 [ 5579 / 18666, 476 ins, 1550 del, 3553 sub ] exp/chain_a/tdnn/decode_test_sova_devices_look_fast/wer_10_0.0
%WER 31.97 [ 13588 / 42496, 1013 ins, 3640 del, 8935 sub ] exp/chain_a/tdnn/decode_test_youtube_look_fast/wer_9_0.0

Binary file not shown.

View File

@@ -0,0 +1,7 @@
--sample-frequency=16000
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-downsample=true

View File

@@ -0,0 +1,10 @@
--min-active=200
--max-active=3000
--beam=10.0
--lattice-beam=2.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=1.0
--endpoint.rule4.min-trailing-silence=2.0

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,5 @@
9855
9856
9857
9858
9859

View File

@@ -0,0 +1,202 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
111 begin
112 end
113 internal
114 singleton
115 begin
116 end
117 internal
118 singleton
119 begin
120 end
121 internal
122 singleton
123 begin
124 end
125 internal
126 singleton
127 begin
128 end
129 internal
130 singleton
131 begin
132 end
133 internal
134 singleton
135 begin
136 end
137 internal
138 singleton
139 begin
140 end
141 internal
142 singleton
143 begin
144 end
145 internal
146 singleton
147 begin
148 end
149 internal
150 singleton
151 begin
152 end
153 internal
154 singleton
155 begin
156 end
157 internal
158 singleton
159 begin
160 end
161 internal
162 singleton
163 begin
164 end
165 internal
166 singleton
167 begin
168 end
169 internal
170 singleton
171 begin
172 end
173 internal
174 singleton
175 begin
176 end
177 internal
178 singleton
179 begin
180 end
181 internal
182 singleton
183 begin
184 end
185 internal
186 singleton
187 begin
188 end
189 internal
190 singleton
191 begin
192 end
193 internal
194 singleton
195 begin
196 end
197 internal
198 singleton
199 begin
200 end
201 internal
202 singleton

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,3 @@
[
8.330133e+10 -4.600894e+09 -2.394861e+09 2.127165e+09 -9.355799e+09 -9.378007e+09 -1.302309e+10 -9.460417e+09 -9.260028e+09 -4.58608e+09 -5.287111e+09 -1.972033e+09 -6.090821e+09 -1.336419e+09 -5.214569e+09 -2.321841e+09 -3.889789e+09 -1.060202e+09 -2.065653e+09 -2.684904e+08 -7.4007e+08 -4587485 -1.315853e+08 -8597548 2.599227e+08 7.408538e+07 5.505751e+08 -1.161846e+07 5.138103e+08 -1.828159e+08 4.251498e+08 -2.901496e+07 6.469246e+08 2.489644e+08 6.289868e+08 2.490337e+08 3.38884e+08 -1.788837e+08 -2.536016e+08 -1.591728e+08 8.388078e+08
8.660994e+12 4.637783e+11 3.366465e+11 4.467952e+11 5.094759e+11 5.179353e+11 6.145244e+11 4.970492e+11 5.014889e+11 4.027981e+11 3.937422e+11 3.602942e+11 3.162307e+11 2.40687e+11 2.267307e+11 1.563018e+11 1.341105e+11 8.535779e+10 6.12398e+10 3.207774e+10 1.737325e+10 5.704115e+09 7.980573e+08 2.168777e+08 2.763352e+09 6.859176e+09 1.214891e+10 1.604714e+10 2.005353e+10 2.240119e+10 2.366007e+10 2.300222e+10 2.406182e+10 2.354406e+10 2.098983e+10 1.619869e+10 1.491578e+10 1.224871e+10 9.502735e+09 6.517532e+09 0 ]

View File

@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

View File

@@ -0,0 +1,2 @@
--left-context=3
--right-context=3

View File

@@ -7,9 +7,9 @@ import Controls
def main():
controls = [
Controls.VoiceAssistant(),
Controls.TelegramBot(),
Controls.RemoteControl(),
Controls.Django(),
#Controls.TelegramBot(),
#Controls.RemoteControl(),
#Controls.Django(),
]
processes = []