You've already forked Irene-Voice-Assistant
mirror of
https://github.com/janvarev/Irene-Voice-Assistant.git
synced 2025-11-23 22:45:08 +02:00
11.1.0 Добавлен плагин нормализации prepare от @Grayen-mail, который умеет обрабатывать допсимволы и английский текст
Он требует библиотеку eng_to_ipa, которая была добавлена в проект. Плагин нормализации prepare теперь установлен по умолчанию при настройке нормализации "default"
This commit is contained in:
@@ -358,6 +358,8 @@ https://github.com/Oknolaz/vasisualy
|
|||||||
|
|
||||||
AlphaCephei за прекрасную библиотеку распознавания Vosk ( https://alphacephei.com/vosk/index.ru )
|
AlphaCephei за прекрасную библиотеку распознавания Vosk ( https://alphacephei.com/vosk/index.ru )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Поддержка проекта
|
## Поддержка проекта
|
||||||
|
|
||||||
Основная сложность в опенсорс - это не писать код. Писать код интересно.
|
Основная сложность в опенсорс - это не писать код. Писать код интересно.
|
||||||
|
|||||||
21
eng_to_ipa/LICENSE
Normal file
21
eng_to_ipa/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2018 Michael Phillips
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
6
eng_to_ipa/__init__.py
Normal file
6
eng_to_ipa/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from .transcribe import *
|
||||||
|
from .stress import *
|
||||||
|
from .rhymes import *
|
||||||
|
from .syllables import *
|
||||||
|
|
||||||
|
__all__ = ['transcribe', 'rhymes', 'stress', 'syllables']
|
||||||
BIN
eng_to_ipa/resources/CMU_dict.db
Normal file
BIN
eng_to_ipa/resources/CMU_dict.db
Normal file
Binary file not shown.
1
eng_to_ipa/resources/CMU_dict.json
Normal file
1
eng_to_ipa/resources/CMU_dict.json
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1,39 @@
|
|||||||
|
AA vowel
|
||||||
|
AE vowel
|
||||||
|
AH vowel
|
||||||
|
AO vowel
|
||||||
|
AW vowel
|
||||||
|
AY vowel
|
||||||
|
B stop
|
||||||
|
CH affricate
|
||||||
|
D stop
|
||||||
|
DH fricative
|
||||||
|
EH vowel
|
||||||
|
ER vowel
|
||||||
|
EY vowel
|
||||||
|
F fricative
|
||||||
|
G stop
|
||||||
|
HH aspirate
|
||||||
|
IH vowel
|
||||||
|
IY vowel
|
||||||
|
JH affricate
|
||||||
|
K stop
|
||||||
|
L liquid
|
||||||
|
M nasal
|
||||||
|
N nasal
|
||||||
|
NG nasal
|
||||||
|
OW vowel
|
||||||
|
OY vowel
|
||||||
|
P stop
|
||||||
|
R liquid
|
||||||
|
S fricative
|
||||||
|
SH fricative
|
||||||
|
T stop
|
||||||
|
TH fricative
|
||||||
|
UH vowel
|
||||||
|
UW vowel
|
||||||
|
V fricative
|
||||||
|
W semivowel
|
||||||
|
Y semivowel
|
||||||
|
Z fricative
|
||||||
|
ZH fricative
|
||||||
133779
eng_to_ipa/resources/CMU_source_files/cmudict-0.7b.txt
Normal file
133779
eng_to_ipa/resources/CMU_source_files/cmudict-0.7b.txt
Normal file
File diff suppressed because it is too large
Load Diff
1
eng_to_ipa/resources/phones.json
Normal file
1
eng_to_ipa/resources/phones.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"aa": "vowel", "ae": "vowel", "ah": "vowel", "q": "vowel", "+": "vowel", "ao": "vowel", "aw": "vowel", "ay": "vowel", "b": "stop", "ch": "affricate", "d": "stop", "dh": "fricative", "eh": "vowel", "er": "vowel", "ey": "vowel", "f": "fricative", "g": "stop", "hh": "aspirate", "ih": "vowel", "iy": "vowel", "jh": "affricate", "k": "stop", "l": "liquid", "m": "nasal", "n": "nasal", "ng": "nasal", "ow": "vowel", "oy": "vowel", "p": "stop", "r": "liquid", "s": "fricative", "sh": "fricative", "t": "stop", "th": "fricative", "uh": "vowel", "uw": "vowel", "v": "fricative", "w": "semivowel", "y": "semivowel", "z": "fricative", "zh": "fricative"}
|
||||||
42
eng_to_ipa/rhymes.py
Normal file
42
eng_to_ipa/rhymes.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# Simple rhyming support. Call get_rhymes() on a word to find rhymes from the CMU dictionary.
|
||||||
|
from eng_to_ipa.transcribe import ModeType, get_cmu, preprocess
|
||||||
|
|
||||||
|
|
||||||
|
def remove_onset(word_in):
    """Drop the consonant onset: return the phonemes from the primary-stressed
    syllable onward.

    Returns None when the transcription contains no primary stress ("1").
    """
    phonemes = get_cmu([word_in])[0][0].split(" ")
    stressed = [idx for idx, ph in enumerate(phonemes) if "1" in ph]
    if stressed:
        return ' '.join(phonemes[stressed[0]:])
|
||||||
|
|
||||||
|
|
||||||
|
def get_rhymes(word, mode="sql"):
    """Return a sorted list of dictionary words that rhyme with *word*.

    Multi-word input returns one rhyme list per word. Rhyming = identical
    phonemes from the primary-stressed vowel onward, excluding the word
    itself and homophones (same phonemes, different spelling).

    :param word: English word (or space-separated words)
    :param mode: "sql" (sqlite cursor) or "json" (in-memory dict) backend
    """
    if len(word.split()) > 1:
        return [get_rhymes(w) for w in word.split()]
    phones = remove_onset(preprocess(word))
    phones_full = get_cmu([preprocess(word)])[0][0]
    asset = ModeType(mode=mode).mode
    if mode == "sql":
        # Fix: use parameterized placeholders — the previous version formatted
        # user input straight into the SQL string (injection-prone).
        asset.execute("SELECT word, phonemes FROM dictionary "
                      "WHERE phonemes LIKE ? AND NOT word=? AND NOT phonemes=?",
                      ("%" + phones, word, phones_full))
        # also don't return results that are the same but spelled differently
        return sorted(set(r[0] for r in asset.fetchall()))
    elif mode == "json":
        r_list = []
        for key, val in asset.items():
            for v in val:
                if v.endswith(phones) and word != key and v != phones_full:
                    r_list.append(key)
        return sorted(set(r_list))
|
||||||
|
|
||||||
|
|
||||||
|
def jhymes(word):
    """Convenience wrapper: rhyme lookup forced onto the JSON backend."""
    return get_rhymes(mode="json", word=word)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # quick manual smoke test: print every rhyme found for a sample word
    sample = "orange"
    for match in get_rhymes(sample):
        print(match)
|
||||||
114
eng_to_ipa/stress.py
Normal file
114
eng_to_ipa/stress.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import eng_to_ipa.syllables as syllables
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
def create_phones_json():
    """Create resources/phones.json (CMU symbol -> phone class) from the CMU
    phones source file.

    Source: http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
    """
    resource_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
    phones_dict = {}
    with open(os.path.join(resource_dir, 'CMU_source_files', 'cmudict-0.7b.phones.txt'),
              encoding="UTF-8") as phones_txt:
        for line in phones_txt:
            parts = line.split(" ")
            phones_dict[parts[0].lower()] = parts[1].replace("\n", "")

    # Fix: specify the encoding on the write side too — it was previously the
    # platform default, inconsistent with the UTF-8 read above.
    with open(os.path.join(resource_dir, 'phones.json'), "w", encoding="UTF-8") as phones_json:
        json.dump(phones_dict, phones_json)
|
||||||
|
|
||||||
|
|
||||||
|
def stress_type(stress):
    """Map a stress-type name to a dict of CMU stress digits -> IPA stress marks.

    :param stress: "primary", "secondary", "both"/"all", "none" (or any falsy
        value) for no marking; anything else falls back to the default with a
        warning.
    :return: dict mapping "1"/"2" to the IPA primary/secondary stress symbols
    """
    default = {"1": "ˈ", "2": "ˌ"}
    # Bug fix: check falsy input BEFORE calling .lower() — stress_type(None)
    # previously raised AttributeError even though the original code had a
    # `not stress` branch that was clearly meant to handle it.
    if not stress:
        return {}
    stress = stress.lower()
    if stress == "primary":
        return {"1": "ˈ"}
    elif stress == "secondary":
        return {"2": "ˌ"}
    elif stress in ("both", "all"):
        return default
    elif stress == "none":
        return {}
    else:
        logging.warning("WARNING: stress type parameter " + stress + " not recognized.")
        # Use default stress
        return default
|
||||||
|
|
||||||
|
|
||||||
|
# Maps lower-case CMU phoneme symbols to their phone class (vowel, stop,
# fricative, ...). Loaded once at import time from the generated resources
# file (see create_phones_json above).
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'resources', 'phones.json'), "r") as phones_json:
    phones = json.load(phones_json)
|
||||||
|
|
||||||
|
|
||||||
|
def find_stress(word, type="all"):
    """Convert stress marking numbers from CMU into actual stress markings.

    :param word: the CMU word string to be evaluated for stress markings
    :param type: type of stress to be evaluated (primary, secondary, or both)
    :return: the CMU string with digits removed and IPA stress marks inserted
    """

    syll_count = syllables.cmu_syllable_count(word)

    # Stress marks are only placed on multi-syllable, transcribed words;
    # everything else just has its digits stripped (see the else branch).
    if (not word.startswith("__IGNORE__")) and syll_count > 1:
        symbols = word.split(' ')
        stress_map = stress_type(type)
        new_word = []
        # common onset consonant clusters that take the stress mark as a unit
        clusters = ["sp", "st", "sk", "fr", "fl"]
        stop_set = ["nasal", "fricative", "vowel"]  # stop searching for where stress starts if these are encountered
        # for each CMU symbol
        for c in symbols:
            # if the last character is a 1 or 2 (that means it has stress, and we want to evaluate it)
            if c[-1] in stress_map.keys():
                # if the new_word list is empty
                if not new_word:
                    # append to new_word the CMU symbol, replacing numbers with stress marks
                    new_word.append(re.sub(r"\d", "", stress_map[re.findall(r"\d", c)[0]] + c))
                else:
                    stress_mark = stress_map[c[-1]]
                    placed = False
                    hiatus = False
                    new_word = new_word[::-1]  # flip the word and backtrack through symbols
                    for i, sym in enumerate(new_word):
                        # strip digits and existing stress marks before lookups
                        sym = re.sub(r"[0-9ˈˌ]", "", sym)
                        prev_sym = re.sub(r"[0-9ˈˌ]", "", new_word[i-1])
                        prev_phone = phones[re.sub(r"[0-9ˈˌ]", "", new_word[i-1])]
                        # NOTE(review): at i == 0 the [i-1] lookups wrap to the
                        # last element — presumably benign since i > 0 guards
                        # the branches that use prev_phone; confirm.
                        if phones[sym] in stop_set or (i > 0 and prev_phone == "stop") or sym in ["er", "w", "j"]:
                            if sym + prev_sym in clusters:
                                # keep onset clusters (reversed here) together
                                new_word[i] = stress_mark + new_word[i]
                            elif not prev_phone == "vowel" and i > 0:
                                new_word[i-1] = stress_mark + new_word[i-1]
                            else:
                                if phones[sym] == "vowel":
                                    # adjacent vowels: prepend the stressed
                                    # symbol itself (hiatus case)
                                    hiatus = True
                                    new_word = [stress_mark + re.sub(r"[0-9ˈˌ]", "", c)] + new_word
                                else:
                                    new_word[i] = stress_mark + new_word[i]
                            placed = True
                            break
                    if not placed:
                        # fallback: mark the first symbol of the word
                        if new_word:
                            new_word[len(new_word) - 1] = stress_mark + new_word[len(new_word) - 1]
                    new_word = new_word[::-1]
                    if not hiatus:
                        new_word.append(re.sub(r"\d", "", c))
            else:
                if c.startswith("__IGNORE__"):
                    new_word.append(c)
                else:
                    new_word.append(re.sub(r"\d", "", c))

        return ' '.join(new_word)
    else:
        if word.startswith("__IGNORE__"):
            return word
        else:
            return re.sub(r"[0-9]", "", word)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # regenerate the phones dictionary from the CMU source when it is missing
    target = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          'resources', 'phones.json')
    if not os.path.isfile(target):
        create_phones_json()
|
||||||
38
eng_to_ipa/syllables.py
Normal file
38
eng_to_ipa/syllables.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import re
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from eng_to_ipa import transcribe
|
||||||
|
|
||||||
|
|
||||||
|
# Maps lower-case CMU phoneme symbols to their phone class (vowel, stop,
# fricative, ...), loaded once at import time.
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'resources', 'phones.json'), "r", encoding="UTF-8") as phones_json:
    PHONES = json.load(phones_json)

# list of adjacent vowel symbols that constitute separate nuclei
hiatus = [["er", "iy"], ["iy", "ow"], ["uw", "ow"], ["iy", "ah"], ["iy", "ey"], ["uw", "eh"], ["er", "eh"]]
|
||||||
|
|
||||||
|
|
||||||
|
def cmu_syllable_count(word):
    """Count syllables in a CMU transcription string.

    Counts vowel nuclei: a vowel starts a new syllable when it is word-initial,
    follows a non-vowel, or forms a hiatus pair with the preceding vowel.
    Untranscribed (__IGNORE__) words count as 0.
    """
    symbols = re.sub(r"\d", "", word).split(' ')
    if "__IGNORE__" in symbols[0]:
        return 0
    nuclei = 0
    for idx, sym in enumerate(symbols):
        if PHONES[sym] != 'vowel':
            continue
        prev_sym = symbols[idx - 1]
        if idx == 0 or PHONES[prev_sym] != 'vowel':
            nuclei += 1
        elif [prev_sym, sym] in hiatus:
            nuclei += 1
    return nuclei
|
||||||
|
|
||||||
|
|
||||||
|
def syllable_count(word: str, db_type="sql"):
    """Transcribe an English word to CMU phonemes and return its syllable count.

    Multi-word input returns one count per word.
    """
    tokens = word.split()
    if len(tokens) > 1:
        return [syllable_count(w) for w in tokens]
    cmu_entry = transcribe.get_cmu([transcribe.preprocess(word)], db_type=db_type)
    return cmu_syllable_count(cmu_entry[0][0])
|
||||||
225
eng_to_ipa/transcribe.py
Normal file
225
eng_to_ipa/transcribe.py
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
from os.path import join, abspath, dirname
|
||||||
|
import eng_to_ipa.stress as stress
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
class ModeType(object):
    """Opens the phoneme database in the requested backend.

    mode="sql"  -> ``self.mode`` is a sqlite3 cursor over resources/CMU_dict.db
    mode="json" -> ``self.mode`` is the dict loaded from resources/CMU_dict.json
    """

    def __init__(self, mode):
        self.name = mode
        if mode.lower() == "sql":
            import sqlite3
            conn = sqlite3.connect(join(abspath(dirname(__file__)),
                                        "resources", "CMU_dict.db"))
            self.mode = conn.cursor()
        elif mode.lower() == "json":
            import json
            # Fixes: resolve the resource relative to this package instead of
            # hopping through the parent directory ("../eng_to_ipa/..."), and
            # close the file handle — it previously leaked.
            with open(join(abspath(dirname(__file__)),
                           "resources", "CMU_dict.json"),
                      encoding="UTF-8") as json_file:
                self.mode = json.load(json_file)

    def __str__(self):
        return self.name
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess(words):
    """Lower-case each token, strip surrounding punctuation, rejoin with spaces."""
    punctuation = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
    cleaned = (token.strip(punctuation).lower() for token in words.split())
    return ' '.join(cleaned)
|
||||||
|
|
||||||
|
|
||||||
|
def preserve_punc(words):
    """Split text into [leading punctuation, cleaned word, trailing punctuation]
    triples, one per whitespace-separated token."""
    triples = []
    for token in words.split():
        leading = re.search(r"^([^A-Za-z0-9]+)[A-Za-z]", token)
        trailing = re.search(r"[A-Za-z]([^A-Za-z0-9]+)$", token)
        triples.append([str(leading.group(1)) if leading else "",
                        preprocess(token),
                        str(trailing.group(1)) if trailing else ""])
    return triples
|
||||||
|
|
||||||
|
|
||||||
|
def apply_punct(triple, as_str=False):
    """Reattach surrounding punctuation to preserve_punc output.

    Accepts a single [before, word, after] triple or a list of such triples
    (the latter is joined IN PLACE, as callers rely on). With *as_str* the
    result is collapsed to one string; otherwise a list is returned.
    """
    if type(triple[0]) == list:
        for idx in range(len(triple)):
            triple[idx] = str(''.join(triple[idx]))
        return ' '.join(triple) if as_str else triple
    joined = str(''.join(triple))
    return joined if as_str else [joined]
|
||||||
|
|
||||||
|
|
||||||
|
def _punct_replace_word(original, transcription):
    """Reapply each original token's punctuation onto all of its IPA
    transcriptions (modifies and returns *transcription*)."""
    for w_idx, variants in enumerate(transcription):
        before, after = original[w_idx][0], original[w_idx][2]
        for v_idx, variant in enumerate(variants):
            transcription[w_idx][v_idx] = apply_punct([before, variant, after],
                                                      as_str=True)
    return transcription
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_words(words_in, db_type="sql"):
    """Look up words in the CMU database.

    :return: list of (word, [phoneme strings]) pairs for the words found
    """
    backend = ModeType(mode=db_type).mode
    if db_type.lower() == "sql":
        placeholders = ", ".join("?" for _ in words_in)
        backend.execute("SELECT word, phonemes FROM dictionary "
                        "WHERE word IN ({0})".format(placeholders), words_in)
        grouped = defaultdict(list)
        for entry, phonemes in backend.fetchall():
            grouped[entry].append(phonemes)
        return list(grouped.items())
    if db_type.lower() == "json":
        return [(entry, prons) for entry, prons in backend.items()
                if entry in words_in]
|
||||||
|
|
||||||
|
|
||||||
|
def get_cmu(tokens_in, db_type="sql"):
    """Return, per input token and in input order, its list of CMU
    pronunciations — or a single-element ["__IGNORE__<token>"] marker when the
    token is not in the dictionary."""
    lookup = dict(fetch_words(tokens_in, db_type))
    ordered = []
    for token in tokens_in:
        if token in lookup:
            ordered.append(lookup[token])
        else:
            ordered.append(["__IGNORE__" + token])
    return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def cmu_to_ipa(cmu_list, mark=True, stress_marking='all'):
    """Converts the CMU word lists into IPA transcriptions.

    :param cmu_list: list (per word) of lists of CMU phoneme strings; modified
        in place (ah1/ah0 are rewritten to internal q1/+0 symbols first)
    :param mark: append "*" to words that could not be transliterated
    :param stress_marking: stress types to mark ('primary'/'secondary'/'all'),
        falsy to strip digits without marking
    :return: list (per word) of lists of IPA strings
    """
    # cmu_list = [[x[0].replace("ah1", "q1").replace("ah0", "+0")] for x in cmu_list]
    for i in range(0, len(cmu_list)):
        for j in range(0, len(cmu_list[i])):
            # distinguish stressed/unstressed "ah" before symbol lookup
            cmu_list[i][j] = cmu_list[i][j].replace("ah1", "q1").replace("ah0", "+0")
    # CMU symbol -> IPA character mapping
    symbols = {"a": "ə", "ey": "eɪ", "aa": "ɑ", "ae": "æ", "+": "ə", "ao": "ɔ", "q": "ʌ",
               "aw": "aʊ", "ay": "aɪ", "ch": "ʧ", "dh": "ð", "eh": "ɛ", "er": "ər",
               "hh": "h", "ih": "ɪ", "jh": "ʤ", "ng": "ŋ", "ow": "oʊ", "oy": "ɔɪ",
               "sh": "ʃ", "th": "θ", "uh": "ʊ", "uw": "u", "zh": "ʒ", "iy": "i", "y": "j"}
    final_list = []  # the final list of IPA tokens to be returned
    for word_list in cmu_list:
        ipa_word_list = []  # the word list for each word
        for word in word_list:
            if stress_marking:
                word = stress.find_stress(word, type=stress_marking)
            else:
                if re.sub(r"\d*", "", word.replace("__IGNORE__", "")) == "":
                    pass  # do not delete token if it's all numbers
                else:
                    word = re.sub("[0-9]", "", word)
            ipa_form = ''
            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # mark words we couldn't transliterate with an asterisk:
                if mark:
                    if not re.sub(r"\d*", "", ipa_form) == "":
                        ipa_form += "*"
            else:
                for piece in word.split(" "):
                    marked = False
                    unmarked = piece
                    if piece[0] in ["ˈ", "ˌ"]:
                        marked = True
                        # NOTE(review): this reassigns the *mark* parameter to
                        # the stress character; later `if mark:` checks then see
                        # a truthy string — looks unintended but behaviorally
                        # harmless. Confirm before refactoring.
                        mark = piece[0]
                        unmarked = piece[1:]
                    if unmarked in symbols:
                        if marked:
                            ipa_form += mark + symbols[unmarked]
                        else:
                            ipa_form += symbols[unmarked]

                    else:
                        # pass unknown symbols through unchanged
                        ipa_form += piece
            # move a stress mark off word-initial ər / ie sequences
            swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]
            for sym in swap_list:
                if not ipa_form.startswith(sym[0]):
                    ipa_form = ipa_form.replace(sym[0], sym[1])
            ipa_word_list.append(ipa_form)
        final_list.append(list(ipa_word_list))
    return final_list
|
||||||
|
|
||||||
|
|
||||||
|
def get_top(ipa_list):
    """Collapse per-word transcription lists to one string, taking only the
    first transcription of each word."""
    return ' '.join(choices[0] for choices in ipa_list)
|
||||||
|
|
||||||
|
|
||||||
|
def get_all(ipa_list):
    """Enumerate every combination of per-word transcriptions.

    Builds the full cross product by cycling each word's alternatives at a
    word-specific rate across all output slots; returns the sentences sorted.
    """
    total = 1
    for choices in ipa_list:
        total *= len(choices)
    combos = ["" for _ in range(total)]
    switch_rate = 0.0
    for w in range(len(ipa_list)):
        # each successive word cycles its alternatives that much faster
        if w == 0:
            switch_rate = total / len(ipa_list[w])
        else:
            switch_rate /= len(ipa_list[w])
        pick = 0
        for slot in range(total):
            if (slot + 1) % int(switch_rate) == 0:
                pick += 1
            if pick == len(ipa_list[w]):
                pick = 0
            combos[slot] = combos[slot] + ipa_list[w][pick] + " "
    return sorted(sent[:-1] for sent in combos)
|
||||||
|
|
||||||
|
|
||||||
|
def ipa_list(words_in, keep_punct=True, stress_marks='both', db_type="sql"):
    """Return, per word, every discovered IPA transcription.

    Accepts either a string (split on whitespace) or a list of words.
    """
    tokens = words_in.split() if type(words_in) == str else words_in
    words = [preserve_punc(w.lower())[0] for w in tokens]
    cmu = get_cmu([w[1] for w in words], db_type=db_type)
    ipa = cmu_to_ipa(cmu, stress_marking=stress_marks)
    if keep_punct:
        ipa = _punct_replace_word(words, ipa)
    return ipa
|
||||||
|
|
||||||
|
|
||||||
|
def isin_cmu(word, db_type="sql"):
    """Check CMU-dictionary membership.

    A string input is tokenized and preprocessed first. Given several words,
    returns True only if every one of them is present.
    """
    if type(word) == str:
        word = [preprocess(token) for token in word.split()]
    matched = {entry[0] for entry in fetch_words(word, db_type)}
    return len(matched) == len(set(word))
|
||||||
|
|
||||||
|
|
||||||
|
def contains(ipa, db_type="sql"):
    """Find words whose IPA transcription (ignoring stress marks) contains the
    given IPA substring.

    :return: list of [word, ipa] pairs. Only implemented for the SQL backend;
        other modes fall through and return None (unchanged behavior).
    """
    asset = ModeType(mode=db_type).mode
    if db_type.lower() == "sql":
        # Fix: bind the LIKE pattern as a parameter — the previous version
        # formatted user input straight into the SQL string (injection-prone).
        asset.execute("SELECT word, ipa FROM eng_ipa WHERE "
                      "REPLACE(REPLACE(ipa, 'ˌ', ''), 'ˈ', '') LIKE ?",
                      ("%" + str(ipa) + "%",))
        return [list(res) for res in asset.fetchall()]
|
||||||
|
|
||||||
|
|
||||||
|
def convert(text, retrieve_all=False, keep_punct=True, stress_marks='both', mode="sql"):
    """Convert English text (a string or a list of words) to IPA.

    Returns the top transcription as a string, or — with *retrieve_all* — a
    sorted list of every transcription combination.
    """
    transcriptions = ipa_list(words_in=text, keep_punct=keep_punct,
                              stress_marks=stress_marks, db_type=mode)
    if retrieve_all:
        return get_all(transcriptions)
    return get_top(transcriptions)
|
||||||
|
|
||||||
|
|
||||||
|
def jonvert(text, retrieve_all=False, keep_punct=True, stress_marks='both'):
    """Like convert(), but always uses the JSON database backend."""
    return convert(text, retrieve_all=retrieve_all, keep_punct=keep_punct,
                   stress_marks=stress_marks, mode="json")
|
||||||
|
|
||||||
|
|
||||||
@@ -7,7 +7,7 @@ from vacore import VACore
|
|||||||
def start(core:VACore):
|
def start(core:VACore):
|
||||||
manifest = {
|
manifest = {
|
||||||
"name": "Core plugin",
|
"name": "Core plugin",
|
||||||
"version": "4.4",
|
"version": "4.5",
|
||||||
"description": "Плагин с основными настройками Ирины.\nПосмотрите другие плагины, чтобы понять, какие команды можно использовать.",
|
"description": "Плагин с основными настройками Ирины.\nПосмотрите другие плагины, чтобы понять, какие команды можно использовать.",
|
||||||
|
|
||||||
"options_label": {
|
"options_label": {
|
||||||
@@ -82,7 +82,7 @@ def start(core:VACore):
|
|||||||
"log_file_level": "DEBUG", # NOTSET | DEBUG | INFO | WARNING | ERROR | CRITICAL
|
"log_file_level": "DEBUG", # NOTSET | DEBUG | INFO | WARNING | ERROR | CRITICAL
|
||||||
"log_file_name": "log.txt", # имя лог-файла
|
"log_file_name": "log.txt", # имя лог-файла
|
||||||
|
|
||||||
"normalization_engine": "numbers", # нормализация текста для русских TTS.
|
"normalization_engine": "default", # нормализация текста для русских TTS.
|
||||||
# Добавляется плагинами. Рекомендуется runorm для качества (но runorm тяжела в обработке)
|
# Добавляется плагинами. Рекомендуется runorm для качества (но runorm тяжела в обработке)
|
||||||
},
|
},
|
||||||
|
|
||||||
@@ -128,6 +128,8 @@ def start_with_options(core:VACore, manifest:dict):
|
|||||||
lingua_franca.load_language(options["linguaFrancaLang"])
|
lingua_franca.load_language(options["linguaFrancaLang"])
|
||||||
|
|
||||||
core.normalization_engine = options["normalization_engine"]
|
core.normalization_engine = options["normalization_engine"]
|
||||||
|
if core.normalization_engine == "default":
|
||||||
|
core.normalization_engine = "prepare"
|
||||||
|
|
||||||
# Логирование
|
# Логирование
|
||||||
core.log_console = options["log_console"]
|
core.log_console = options["log_console"]
|
||||||
|
|||||||
Reference in New Issue
Block a user