1
0
mirror of https://github.com/janvarev/Irene-Voice-Assistant.git synced 2025-11-23 22:45:08 +02:00

11.1.0 Добавлен плагин нормализации prepare от @Grayen-mail, который умеет обрабатывать допсимволы и английский текст

Он требует библиотеку eng_to_ipa, которая была добавлена в проект.
Плагин нормализации prepare теперь устанавливается по умолчанию при настройке нормализации "default"
This commit is contained in:
Janvarev Vladislav
2025-05-17 13:41:23 +03:00
parent 1132e56df3
commit 4c58cbdb5a
14 changed files with 134273 additions and 3 deletions

View File

@@ -358,6 +358,8 @@ https://github.com/Oknolaz/vasisualy
AlphaCephei за прекрасную библиотеку распознавания Vosk ( https://alphacephei.com/vosk/index.ru ) AlphaCephei за прекрасную библиотеку распознавания Vosk ( https://alphacephei.com/vosk/index.ru )
## Поддержка проекта ## Поддержка проекта
Основная сложность в опенсорс - это не писать код. Писать код интересно. Основная сложность в опенсорс - это не писать код. Писать код интересно.

21
eng_to_ipa/LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Michael Phillips
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

6
eng_to_ipa/__init__.py Normal file
View File

@@ -0,0 +1,6 @@
from .transcribe import *
from .stress import *
from .rhymes import *
from .syllables import *
__all__ = ['transcribe', 'rhymes', 'stress', 'syllables']

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,39 @@
AA vowel
AE vowel
AH vowel
AO vowel
AW vowel
AY vowel
B stop
CH affricate
D stop
DH fricative
EH vowel
ER vowel
EY vowel
F fricative
G stop
HH aspirate
IH vowel
IY vowel
JH affricate
K stop
L liquid
M nasal
N nasal
NG nasal
OW vowel
OY vowel
P stop
R liquid
S fricative
SH fricative
T stop
TH fricative
UH vowel
UW vowel
V fricative
W semivowel
Y semivowel
Z fricative
ZH fricative

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
{"aa": "vowel", "ae": "vowel", "ah": "vowel", "q": "vowel", "+": "vowel", "ao": "vowel", "aw": "vowel", "ay": "vowel", "b": "stop", "ch": "affricate", "d": "stop", "dh": "fricative", "eh": "vowel", "er": "vowel", "ey": "vowel", "f": "fricative", "g": "stop", "hh": "aspirate", "ih": "vowel", "iy": "vowel", "jh": "affricate", "k": "stop", "l": "liquid", "m": "nasal", "n": "nasal", "ng": "nasal", "ow": "vowel", "oy": "vowel", "p": "stop", "r": "liquid", "s": "fricative", "sh": "fricative", "t": "stop", "th": "fricative", "uh": "vowel", "uw": "vowel", "v": "fricative", "w": "semivowel", "y": "semivowel", "z": "fricative", "zh": "fricative"}

42
eng_to_ipa/rhymes.py Normal file
View File

@@ -0,0 +1,42 @@
# Simple rhyming support. Call get_rhymes() on a word to find rhymes from the CMU dictionary.
from eng_to_ipa.transcribe import ModeType, get_cmu, preprocess
def remove_onset(word_in):
    """Drop everything before the primary-stressed phoneme of *word_in*.

    Looks up the first CMU transcription of the word and returns the
    phoneme string from the primary-stress nucleus (digit "1") onward.
    Returns None implicitly when no primary stress is present.
    """
    phonemes = get_cmu([word_in])[0][0].split(" ")
    for idx, phoneme in enumerate(phonemes):
        if "1" in phoneme:
            return ' '.join(phonemes[idx:])
def get_rhymes(word, mode="sql"):
    """Return a sorted list of dictionary words that rhyme with *word*.

    A rhyme is any entry whose phonemes end with the rhyming part of the
    input (primary-stressed nucleus onward) but is neither the same word
    nor an identical full transcription (same word spelled differently).

    :param word: word to rhyme; multi-word input returns one list per word
    :param mode: "sql" queries the SQLite CMU database, "json" scans the
        JSON dictionary
    """
    if len(word.split()) > 1:
        return [get_rhymes(w) for w in word.split()]
    phones = remove_onset(preprocess(word))
    phones_full = get_cmu([preprocess(word)])[0][0]
    asset = ModeType(mode=mode).mode
    if mode == "sql":
        # Parameterized query: the previous version interpolated the
        # user-supplied word straight into the SQL text via str.format.
        asset.execute("SELECT word, phonemes FROM dictionary WHERE phonemes "
                      "LIKE ? AND NOT word=? AND NOT phonemes=?",
                      ("%" + phones, word, phones_full))
        # also don't return results that are the same but spelled differently
        return sorted(list(set([r[0] for r in asset.fetchall()])))
    elif mode == "json":
        r_list = []
        for key, val in asset.items():
            for v in val:
                if v.endswith(phones) and word != key and v != phones_full:
                    r_list.append(key)
        return sorted(set(r_list))
def jhymes(word):
    """Get rhymes with forced JSON mode.

    Convenience wrapper around get_rhymes() that always reads the JSON
    dictionary instead of the SQLite database.
    """
    return get_rhymes(word, mode="json")
if __name__ == "__main__":
    # Demo entry point: print every rhyme found for a sample word.
    sample = "orange"
    for match in get_rhymes(sample):
        print(match)

114
eng_to_ipa/stress.py Normal file
View File

@@ -0,0 +1,114 @@
import os
import re
import json
import eng_to_ipa.syllables as syllables
import logging
def create_phones_json():
    """Creates the phones.json file in the resources directory from the phones.txt source file from CMU"""
    base_dir = os.path.abspath(os.path.dirname(__file__))
    source_path = os.path.join(base_dir, 'resources', 'CMU_source_files',
                               'cmudict-0.7b.phones.txt')
    # source link: http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
    phones_dict = {}
    with open(source_path, encoding="UTF-8") as phones_txt:
        for line in phones_txt.readlines():
            symbol, phone_class = line.split(" ")[0], line.split(" ")[1]
            phones_dict[symbol.lower()] = phone_class.replace("\n", "")
    with open(os.path.join(base_dir, 'resources', 'phones.json'), "w") as phones_json:
        json.dump(phones_dict, phones_json)
def stress_type(stress):
    """Determine the kind of stress that should be evaluated.

    :param stress: one of "primary", "secondary", "both"/"all", "none"
        (case-insensitive), or a falsy value; any other string logs a
        warning and falls back to the default (primary + secondary).
    :return: dict mapping CMU stress digits ("1"/"2") to IPA stress marks
    """
    # Guard falsy input (None / "") before calling .lower(); the previous
    # version raised AttributeError when stress was None.
    if not stress:
        return {}
    stress = stress.lower()
    default = {"1": "ˈ", "2": "ˌ"}
    if stress == "primary":
        return {"1": "ˈ"}
    elif stress == "secondary":
        return {"2": "ˌ"}
    elif stress in ("both", "all"):
        return default
    elif stress == "none":
        return {}
    else:
        logging.warning("WARNING: stress type parameter " + stress + " not recognized.")
        # Use default stress
        return default
# Load the CMU phone-class map (e.g. "aa" -> "vowel") once at import time;
# find_stress consults it to decide where a stress mark attaches.
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'resources', 'phones.json'), "r") as phones_json:
    phones = json.load(phones_json)
def find_stress(word, type="all"):
    """Convert stress marking numbers from CMU into actual stress markings.

    Walks the CMU symbols of a multi-syllable word and, on hitting a
    stressed vowel (digit 1/2), backtracks through the symbols already
    emitted to place the IPA stress mark at the syllable onset.

    :param word: the CMU word string to be evaluated for stress markings
    :param type: type of stress to be evaluated (primary, secondary, or both)
    :return: the CMU string with digits replaced by IPA stress marks;
        single-syllable and "__IGNORE__" words pass through (digits stripped
        or untouched, respectively)
    """
    syll_count = syllables.cmu_syllable_count(word)
    # Stress marks only matter for multi-syllable, transcribable words.
    if (not word.startswith("__IGNORE__")) and syll_count > 1:
        symbols = word.split(' ')
        stress_map = stress_type(type)
        new_word = []
        # Onset consonant clusters that keep the stress mark in front of them.
        clusters = ["sp", "st", "sk", "fr", "fl"]
        stop_set = ["nasal", "fricative", "vowel"]  # stop searching for where stress starts if these are encountered
        # for each CMU symbol
        for c in symbols:
            # if the last character is a 1 or 2 (that means it has stress, and we want to evaluate it)
            if c[-1] in stress_map.keys():
                # if the new_word list is empty
                if not new_word:
                    # append to new_word the CMU symbol, replacing numbers with stress marks
                    new_word.append(re.sub(r"\d", "", stress_map[re.findall(r"\d", c)[0]] + c))
                else:
                    stress_mark = stress_map[c[-1]]
                    placed = False
                    hiatus = False
                    new_word = new_word[::-1]  # flip the word and backtrack through symbols
                    for i, sym in enumerate(new_word):
                        # Compare symbols with digits/stress marks stripped.
                        sym = re.sub(r"[0-9ˈˌ]", "", sym)
                        prev_sym = re.sub(r"[0-9ˈˌ]", "", new_word[i-1])
                        # NOTE(review): at i == 0 these index new_word[-1]
                        # (wrap-around); presumably harmless because the
                        # i > 0 guards below — confirm.
                        prev_phone = phones[re.sub(r"[0-9ˈˌ]", "", new_word[i-1])]
                        if phones[sym] in stop_set or (i > 0 and prev_phone == "stop") or sym in ["er", "w", "j"]:
                            if sym + prev_sym in clusters:
                                # Keep the whole onset cluster after the mark.
                                new_word[i] = stress_mark + new_word[i]
                            elif not prev_phone == "vowel" and i > 0:
                                new_word[i-1] = stress_mark + new_word[i-1]
                            else:
                                if phones[sym] == "vowel":
                                    # Adjacent vowels (hiatus): the stressed
                                    # vowel starts its own syllable.
                                    hiatus = True
                                    new_word = [stress_mark + re.sub(r"[0-9ˈˌ]", "", c)] + new_word
                                else:
                                    new_word[i] = stress_mark + new_word[i]
                            placed = True
                            break
                    if not placed:
                        # No onset boundary found: mark the earliest symbol.
                        if new_word:
                            new_word[len(new_word) - 1] = stress_mark + new_word[len(new_word) - 1]
                    new_word = new_word[::-1]
                    if not hiatus:
                        new_word.append(re.sub(r"\d", "", c))
            else:
                if c.startswith("__IGNORE__"):
                    new_word.append(c)
                else:
                    new_word.append(re.sub(r"\d", "", c))
        return ' '.join(new_word)
    else:
        if word.startswith("__IGNORE__"):
            return word
        else:
            return re.sub(r"[0-9]", "", word)
if __name__ == "__main__":
    # create phones dictionary from source if not found in the resources directory
    target = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          'resources', 'phones.json')
    if not os.path.isfile(target):
        create_phones_json()

38
eng_to_ipa/syllables.py Normal file
View File

@@ -0,0 +1,38 @@
import re
import os
import json
from eng_to_ipa import transcribe
# Phone-class map ("aa" -> "vowel", "b" -> "stop", ...) loaded once at import
# time from the packaged resources; used to detect syllable nuclei.
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)),
                       'resources', 'phones.json'), "r", encoding="UTF-8") as phones_json:
    PHONES = json.load(phones_json)
# list of adjacent vowel symbols that constitute separate nuclei
hiatus = [["er", "iy"], ["iy", "ow"], ["uw", "ow"], ["iy", "ah"], ["iy", "ey"], ["uw", "eh"], ["er", "eh"]]
def cmu_syllable_count(word):
    """Count syllables in a CMU transcription string.

    A syllable nucleus is a vowel not preceded by another vowel, except
    for known hiatus pairs, which count as two nuclei. "__IGNORE__"
    (untranscribable) words count as zero.
    """
    symbols = re.sub(r"\d", "", word).split(' ')
    if "__IGNORE__" in symbols[0]:
        return 0
    nuclei = 0
    for idx, sym in enumerate(symbols):
        prior = symbols[idx - 1]
        prior_class = PHONES[prior]
        if PHONES[sym] == 'vowel':
            if idx == 0 or prior_class != 'vowel':
                nuclei += 1
            elif [prior, sym] in hiatus:
                nuclei += 1
    return nuclei
def syllable_count(word: str, db_type="sql"):
    """Transcribe a regular English word to CMU and count its syllables.

    Multi-word input returns one count per word.
    """
    tokens = word.split()
    if len(tokens) > 1:
        return [syllable_count(w) for w in tokens]
    cmu_entry = transcribe.get_cmu([transcribe.preprocess(word)], db_type=db_type)
    return cmu_syllable_count(cmu_entry[0][0])

225
eng_to_ipa/transcribe.py Normal file
View File

@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
import re
from os.path import join, abspath, dirname
import eng_to_ipa.stress as stress
from collections import defaultdict
class ModeType(object):
    """Handle to the CMU pronunciation data in one of two backends.

    After construction, ``self.mode`` is either a sqlite3 cursor
    (``mode="sql"``) or the parsed JSON dictionary (``mode="json"``).
    """

    def __init__(self, mode):
        self.name = mode
        if mode.lower() == "sql":
            import sqlite3
            conn = sqlite3.connect(join(abspath(dirname(__file__)),
                                        "./resources/CMU_dict.db"))
            self.mode = conn.cursor()
        elif mode.lower() == "json":
            import json
            # Context manager closes the file promptly; the previous
            # version leaked the open file handle.
            with open(join(abspath(dirname(__file__)),
                           "../eng_to_ipa/resources/CMU_dict.json"),
                      encoding="UTF-8") as json_file:
                self.mode = json.load(json_file)

    def __str__(self):
        return self.name
def preprocess(words):
    """Lower-case every token and strip surrounding punctuation.

    Returns the cleaned tokens re-joined with single spaces.
    """
    strip_chars = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
    cleaned = (token.strip(strip_chars).lower() for token in words.split())
    return ' '.join(cleaned)
def preserve_punc(words):
    """Split text into [before, cleaned_word, after] punctuation triples.

    For each whitespace token, records any leading and trailing
    non-alphanumeric runs so they can be re-attached after transcription.
    """
    triples = []
    for token in words.split():
        entry = ["", preprocess(token), ""]
        leading = re.search(r"^([^A-Za-z0-9]+)[A-Za-z]", token)
        trailing = re.search(r"[A-Za-z]([^A-Za-z0-9]+)$", token)
        if leading:
            entry[0] = str(leading.group(1))
        if trailing:
            entry[2] = str(trailing.group(1))
        triples.append(entry)
    return triples
def apply_punct(triple, as_str=False):
    """Re-attach surrounding punctuation from preserve_punc triples.

    :param triple: a single [before, word, after] triple, or a list of
        such triples (a whole sentence); a list of triples is collapsed
        in place to joined strings
    :param as_str: if True, return one space-joined string instead of a
        list
    """
    # isinstance instead of `type(...) == list` (idiomatic type check).
    if isinstance(triple[0], list):
        for i, t in enumerate(triple):
            triple[i] = ''.join(t)
        if as_str:
            return ' '.join(triple)
        return triple
    # Single triple: join before+word+after.
    if as_str:
        return ''.join(triple)
    return [''.join(triple)]
def _punct_replace_word(original, transcription):
    """Re-apply each word's original punctuation to its IPA alternatives.

    Mutates and returns *transcription*, wrapping every alternative of
    word i with the before/after punctuation recorded in original[i].
    """
    for w_idx, alternatives in enumerate(transcription):
        before, _, after = original[w_idx]
        for a_idx, alt in enumerate(alternatives):
            transcription[w_idx][a_idx] = apply_punct([before, alt, after], as_str=True)
    return transcription
def fetch_words(words_in, db_type="sql"):
    """Fetch (word, [phonemes, ...]) pairs for the given words.

    SQL mode groups multiple pronunciations per word; JSON mode returns
    the stored lists directly. Words absent from the dictionary are
    simply missing from the result.
    """
    asset = ModeType(mode=db_type).mode
    backend = db_type.lower()
    if backend == "sql":
        placeholders = ", ".join("?" for _ in words_in)
        asset.execute("SELECT word, phonemes FROM dictionary "
                      "WHERE word IN ({0})".format(placeholders), words_in)
        grouped = defaultdict(list)
        for found_word, phonemes in asset.fetchall():
            grouped[found_word].append(phonemes)
        return list(grouped.items())
    if backend == "json":
        return [(key, val) for key, val in asset.items() if key in words_in]
def get_cmu(tokens_in, db_type="sql"):
    """Look up CMU phonemes for each token, preserving input order.

    Tokens not found in the dictionary are returned as a single-element
    list tagged with the "__IGNORE__" prefix.
    """
    found = fetch_words(tokens_in, db_type)
    ordered = []
    for token in tokens_in:
        matches = [phones for entry_word, phones in found if entry_word == token]
        if matches:
            ordered.append(matches[0])
        else:
            ordered.append(["__IGNORE__" + token])
    return ordered
def cmu_to_ipa(cmu_list, mark=True, stress_marking='all'):
    """Convert CMU word lists into IPA transcriptions.

    :param cmu_list: one inner list of CMU phoneme strings per word,
        e.g. [["hh ah0 l ow1"]]; the input is not modified
    :param mark: if True, untranscribable "__IGNORE__" words get a "*"
    :param stress_marking: stress style forwarded to stress.find_stress
        ("all"/"both", "primary", "secondary"); falsy disables stress marks
    :return: list of lists of IPA strings, parallel to cmu_list
    """
    # Distinguish stressed/unstressed schwa before symbol lookup. Done on
    # a copy: the previous version rewrote the caller's list in place.
    cmu_list = [[w.replace("ah1", "q1").replace("ah0", "+0") for w in word_list]
                for word_list in cmu_list]
    # CMU symbol -> IPA. The diphthong values for ey/aw/ay/ow were empty
    # strings in the reviewed source (apparently lost characters); restored
    # to the standard eng_to_ipa values.
    symbols = {"a": "ə", "ey": "eɪ", "aa": "ɑ", "ae": "æ", "+": "ə", "ao": "ɔ", "q": "ʌ",
               "aw": "aʊ", "ay": "aɪ", "ch": "ʧ", "dh": "ð", "eh": "ɛ", "er": "ər",
               "hh": "h", "ih": "ɪ", "jh": "ʤ", "ng": "ŋ", "ow": "oʊ", "oy": "ɔɪ",
               "sh": "ʃ", "th": "θ", "uh": "ʊ", "uw": "u", "zh": "ʒ", "iy": "i", "y": "j"}
    final_list = []  # the final list of IPA tokens to be returned
    for word_list in cmu_list:
        ipa_word_list = []  # the word list for each word
        for word in word_list:
            if stress_marking:
                word = stress.find_stress(word, type=stress_marking)
            else:
                if re.sub(r"\d*", "", word.replace("__IGNORE__", "")) == "":
                    pass  # do not delete token if it's all numbers
                else:
                    word = re.sub("[0-9]", "", word)
            ipa_form = ''
            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # mark words we couldn't transliterate with an asterisk:
                if mark:
                    if not re.sub(r"\d*", "", ipa_form) == "":
                        ipa_form += "*"
            else:
                for piece in word.split(" "):
                    marked = False
                    unmarked = piece
                    if piece[0] in ["ˈ", "ˌ"]:
                        marked = True
                        # Local name for the stress character: the previous
                        # version reassigned the `mark` parameter here,
                        # corrupting the asterisk behavior for later words.
                        stress_char = piece[0]
                        unmarked = piece[1:]
                    if unmarked in symbols:
                        if marked:
                            ipa_form += stress_char + symbols[unmarked]
                        else:
                            ipa_form += symbols[unmarked]
                    else:
                        ipa_form += piece
            # Move a stress mark that landed inside these sequences back to
            # the syllable boundary (unless the word starts with it).
            swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]
            for sym in swap_list:
                if not ipa_form.startswith(sym[0]):
                    ipa_form = ipa_form.replace(sym[0], sym[1])
            ipa_word_list.append(ipa_form)
        final_list.append(list(ipa_word_list))
    return final_list
def get_top(ipa_list):
    """Returns only the one result for a query. If multiple entries for words are found, only the first is used."""
    first_choices = (alternatives[0] for alternatives in ipa_list)
    return ' '.join(first_choices)
def get_all(ipa_list):
    """Return every possible IPA transcription of the sentence, sorted.

    Each element of *ipa_list* holds the alternative transcriptions of
    one word; the result is the Cartesian product of those alternatives,
    each combination joined with spaces. Replaces the previous
    index-arithmetic implementation, which relied on fragile float
    division to cycle through alternatives.
    """
    from itertools import product
    return sorted(' '.join(combo) for combo in product(*ipa_list))
def ipa_list(words_in, keep_punct=True, stress_marks='both', db_type="sql"):
    """Returns a list of all the discovered IPA transcriptions for each word."""
    if type(words_in) == str:
        tokens = words_in.split()
    else:
        tokens = words_in
    words = [preserve_punc(tok.lower())[0] for tok in tokens]
    cmu = get_cmu([w[1] for w in words], db_type=db_type)
    transcriptions = cmu_to_ipa(cmu, stress_marking=stress_marks)
    if keep_punct:
        transcriptions = _punct_replace_word(words, transcriptions)
    return transcriptions
def isin_cmu(word, db_type="sql"):
    """checks if a word is in the CMU dictionary. Doesn't strip punctuation.
    If given more than one word, returns True only if all words are present."""
    if type(word) == str:
        word = [preprocess(w) for w in word.split()]
    found = {entry[0] for entry in fetch_words(word, db_type)}
    return len(found) == len(set(word))
def contains(ipa, db_type="sql"):
    """Get any words that contain the IPA string. Returns the word and the IPA as a list.

    Matching ignores stress marks in the stored transcription.
    """
    asset = ModeType(mode=db_type).mode
    if db_type.lower() == "sql":
        # Parameterized LIKE pattern: the previous version formatted the
        # user-supplied string straight into the SQL text.
        asset.execute("SELECT word, ipa FROM eng_ipa WHERE "
                      "REPLACE(REPLACE(ipa, 'ˌ', ''), 'ˈ', '') "
                      "LIKE ?", ("%" + str(ipa) + "%",))
        return [list(res) for res in asset.fetchall()]
def convert(text, retrieve_all=False, keep_punct=True, stress_marks='both', mode="sql"):
    """takes either a string or list of English words and converts them to IPA"""
    transcriptions = ipa_list(words_in=text, keep_punct=keep_punct,
                              stress_marks=stress_marks, db_type=mode)
    if retrieve_all:
        return get_all(transcriptions)
    return get_top(transcriptions)
def jonvert(text, retrieve_all=False, keep_punct=True, stress_marks='both'):
    """Forces use of JSON database for fetching phoneme data."""
    return convert(text, retrieve_all=retrieve_all, keep_punct=keep_punct,
                   stress_marks=stress_marks, mode="json")

View File

@@ -7,7 +7,7 @@ from vacore import VACore
def start(core:VACore): def start(core:VACore):
manifest = { manifest = {
"name": "Core plugin", "name": "Core plugin",
"version": "4.4", "version": "4.5",
"description": "Плагин с основными настройками Ирины.\nПосмотрите другие плагины, чтобы понять, какие команды можно использовать.", "description": "Плагин с основными настройками Ирины.\nПосмотрите другие плагины, чтобы понять, какие команды можно использовать.",
"options_label": { "options_label": {
@@ -82,7 +82,7 @@ def start(core:VACore):
"log_file_level": "DEBUG", # NOTSET | DEBUG | INFO | WARNING | ERROR | CRITICAL "log_file_level": "DEBUG", # NOTSET | DEBUG | INFO | WARNING | ERROR | CRITICAL
"log_file_name": "log.txt", # имя лог-файла "log_file_name": "log.txt", # имя лог-файла
"normalization_engine": "numbers", # нормализация текста для русских TTS. "normalization_engine": "default", # нормализация текста для русских TTS.
# Добавляется плагинами. Рекомендуется runorm для качества (но runorm тяжела в обработке) # Добавляется плагинами. Рекомендуется runorm для качества (но runorm тяжела в обработке)
}, },
@@ -128,6 +128,8 @@ def start_with_options(core:VACore, manifest:dict):
lingua_franca.load_language(options["linguaFrancaLang"]) lingua_franca.load_language(options["linguaFrancaLang"])
core.normalization_engine = options["normalization_engine"] core.normalization_engine = options["normalization_engine"]
if core.normalization_engine == "default":
core.normalization_engine = "prepare"
# Логирование # Логирование
core.log_console = options["log_console"] core.log_console = options["log_console"]

View File

@@ -12,7 +12,7 @@ from jaa import JaaCore
from collections.abc import Callable from collections.abc import Callable
version = "11.0.0" version = "11.1.0"
import logging import logging