mirror of
https://github.com/LibreTranslate/LibreTranslate.git
synced 2025-01-11 17:18:16 +02:00
Merge pull request #60 from mammo0/polyglot
use polyglot for language detection and transliteration
This commit is contained in:
commit
fb031b826a
@ -1,2 +1,10 @@
|
||||
import os
|
||||
from appdirs import user_data_dir
|
||||
|
||||
# override polyglot path
|
||||
import polyglot
|
||||
polyglot.polyglot_path = os.path.join(user_data_dir(appname="LibreTranslate", appauthor="uav4geo"), "polyglot_data")
|
||||
|
||||
|
||||
from .main import main
|
||||
from .manage import manage
|
||||
|
32
app/app.py
32
app/app.py
@ -2,12 +2,9 @@ import os
|
||||
from flask import Flask, render_template, jsonify, request, abort, send_from_directory
|
||||
from flask_swagger import swagger
|
||||
from flask_swagger_ui import get_swaggerui_blueprint
|
||||
from langdetect import detect_langs
|
||||
from langdetect import DetectorFactory
|
||||
from pkg_resources import resource_filename
|
||||
from .api_keys import Database
|
||||
|
||||
DetectorFactory.seed = 0 # deterministic
|
||||
from app.language import detect_languages, transliterate
|
||||
|
||||
api_keys_db = None
|
||||
|
||||
@ -57,11 +54,6 @@ def create_app(args):
|
||||
from app.language import languages
|
||||
app = Flask(__name__)
|
||||
|
||||
# For faster access
|
||||
language_map = {}
|
||||
for l in languages:
|
||||
language_map[l.code] = l.name
|
||||
|
||||
if args.debug:
|
||||
app.config['TEMPLATES_AUTO_RELOAD'] = True
|
||||
|
||||
@ -271,19 +263,12 @@ def create_app(args):
|
||||
abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit))
|
||||
|
||||
if source_lang == 'auto':
|
||||
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
|
||||
|
||||
if len(candidate_langs) > 0:
|
||||
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
|
||||
candidate_langs = detect_languages(q)
|
||||
|
||||
if args.debug:
|
||||
print(candidate_langs)
|
||||
|
||||
source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None)
|
||||
if not source_lang:
|
||||
source_lang = 'en'
|
||||
else:
|
||||
source_lang = 'en'
|
||||
source_lang = candidate_langs[0]["language"]
|
||||
|
||||
if args.debug:
|
||||
print("Auto detected: %s" % source_lang)
|
||||
@ -300,9 +285,9 @@ def create_app(args):
|
||||
|
||||
try:
|
||||
if batch:
|
||||
return jsonify({"translatedText": [translator.translate(text) for text in q] })
|
||||
return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
|
||||
else:
|
||||
return jsonify({"translatedText": translator.translate(q) })
|
||||
return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
|
||||
except Exception as e:
|
||||
abort(500, description="Cannot translate text: %s" % str(e))
|
||||
|
||||
@ -385,12 +370,7 @@ def create_app(args):
|
||||
if not q:
|
||||
abort(400, description="Invalid request: missing q parameter")
|
||||
|
||||
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
|
||||
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
|
||||
return jsonify([{
|
||||
'confidence': l.prob,
|
||||
'language': l.lang
|
||||
} for l in candidate_langs])
|
||||
return jsonify(detect_languages(q))
|
||||
|
||||
|
||||
@app.route("/frontend/settings")
|
||||
|
30
app/init.py
30
app/init.py
@ -2,9 +2,12 @@ import os
|
||||
from pathlib import Path
|
||||
from argostranslate import settings, package, translate
|
||||
import os, glob, shutil, zipfile
|
||||
from app.language import languages
|
||||
import polyglot
|
||||
|
||||
def boot():
    """Run the start-up installation checks.

    Calls the model check first and the transliteration check second,
    each with its default arguments.
    """
    # models first: the transliteration check reads the module-level
    # ``languages`` list to decide which packages it needs
    check_and_install_models()
    check_and_install_transliteration()
|
||||
|
||||
def check_and_install_models(force=False):
|
||||
if len(package.get_installed_packages()) < 2 or force:
|
||||
@ -22,5 +25,32 @@ def check_and_install_models(force=False):
|
||||
download_path = available_package.download()
|
||||
package.install_from_path(download_path)
|
||||
|
||||
# reload installed languages
|
||||
global languages
|
||||
languages = translate.load_installed_languages()
|
||||
print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages)))
|
||||
|
||||
|
||||
def check_and_install_transliteration(force=False):
    """Download the polyglot transliteration packages that are needed.

    One package is considered per entry of the module-level ``languages``
    list, except for the code ``"en"``.

    :param force: when true, every package is downloaded; otherwise only
        those whose archive file is not found below
        ``polyglot.polyglot_path``.
    """
    # 'en' is not a supported transliteration language
    wanted = [lang.code for lang in languages if lang.code != "en"]

    if force:
        missing = wanted
    else:
        # a package counts as installed when its archive file exists
        packages_dir = Path(polyglot.polyglot_path) / "transliteration2"
        missing = [
            code for code in wanted
            if not (packages_dir / code / f"transliteration.{code}.tar.bz2").exists()
        ]

    # nothing to download
    if not missing:
        return

    print(f"Installing transliteration models for the following languages: {', '.join(missing)}")

    # imported only when a download is actually required
    from polyglot.downloader import Downloader
    downloader = Downloader()

    for code in missing:
        downloader.download(f"transliteration2.{code}")
|
||||
|
121
app/language.py
121
app/language.py
@ -1,3 +1,124 @@
|
||||
import string
|
||||
|
||||
from argostranslate import translate
|
||||
from polyglot.detect.base import Detector, UnknownLanguage
|
||||
from polyglot.transliteration.base import Transliterator
|
||||
|
||||
|
||||
languages = translate.load_installed_languages()
|
||||
|
||||
|
||||
__lang_codes = [l.code for l in languages]
|
||||
|
||||
|
||||
def detect_languages(text):
    """Detect the language(s) of a text or of a batch of texts.

    :param text: a single string, or a list of strings (batch mode).
    :return: a list of dicts with the keys ``'confidence'`` and
        ``'language'``, best candidate first. Only languages whose code is
        in the module-level ``__lang_codes`` list are reported. When nothing
        usable is detected the result is a single ``"en"`` entry with a
        confidence of ``0.0``.
    """
    # used whenever no supported language could be detected
    fallback = [
        {
            'confidence': 0.0,
            'language': "en"
        }
    ]

    # detect batch processing; a bare string becomes a one-element batch
    is_batch = isinstance(text, list)
    texts = text if is_batch else [text]

    # get the candidates
    candidates = []
    for t in texts:
        try:
            candidates.extend(Detector(t).languages)
        except UnknownLanguage:
            # best effort: a text that cannot be classified simply
            # contributes no candidates
            continue

    if not candidates:
        return fallback

    # total read bytes of the provided text
    read_bytes_total = sum(c.read_bytes for c in candidates)

    # only use candidates that are supported by argostranslate
    supported = set(__lang_codes)
    usable = [c for c in candidates if c.read_bytes != 0 and c.code in supported]

    # this happens if no supported language could be detected
    if not usable:
        return fallback

    if is_batch:
        # the same language can be detected for several texts of a batch:
        # group the candidates by language code in a single pass
        grouped = {}
        for c in usable:
            grouped.setdefault(c.code, []).append(c)

        # merge each group into one (code, average confidence, summed bytes)
        # entry; walking the installed codes keeps the pre-sort order stable
        entries = []
        for code in __lang_codes:
            group = grouped.pop(code, None)
            if group:
                entries.append((
                    code,
                    sum(c.confidence for c in group) / len(group),
                    sum(c.read_bytes for c in group),
                ))
    else:
        entries = [(c.code, c.confidence, c.read_bytes) for c in usable]

    # sort descending by confidence weighted with the share of bytes read;
    # read_bytes_total is non-zero here because every usable candidate has
    # read_bytes != 0
    entries.sort(key=lambda e: (e[1] * e[2]) / read_bytes_total, reverse=True)

    return [
        {
            'confidence': confidence,
            'language': code
        }
        for code, confidence, _ in entries
    ]
|
||||
|
||||
|
||||
def __transliterate_line(transliterator, line_text):
    """Transliterate a single line word by word, keeping its punctuation.

    The line is split on single spaces. Leading and trailing punctuation
    (``string.punctuation``) is removed from each word before it is handed
    to the transliterator and is put back afterwards, unchanged and in its
    original order.

    :param transliterator: object with a ``transliterate(word)`` method.
    :param line_text: the line to process (no line breaks expected).
    :return: the rebuilt line; a word whose transliteration comes back
        empty is kept exactly as it was.
    """
    new_text = []

    # transliteration is done word by word
    for orig_word in line_text.split(" "):
        # the word without any surrounding punctuation
        core = orig_word.strip(string.punctuation)

        # empty or punctuation-only tokens have nothing to transliterate
        t_word = transliterator.transliterate(core) if core else ""

        # if transliteration fails, default back to the original word
        if not t_word:
            new_text.append(orig_word)
            continue

        # add back the stripped punctuation by slicing the original word;
        # slicing keeps order and repeated characters ("!!!", "?!") and is
        # not affected by the same character occurring on both sides
        lead_len = len(orig_word) - len(orig_word.lstrip(string.punctuation))
        trail_start = len(orig_word.rstrip(string.punctuation))
        new_text.append(orig_word[:lead_len] + t_word + orig_word[trail_start:])

    # rebuild the text
    return " ".join(new_text)
|
||||
|
||||
|
||||
def transliterate(text, target_lang="en"):
    """Transliterate a text that may span several lines.

    :param text: the string to transliterate; line breaks (``"\\n"``) are
        preserved.
    :param target_lang: language code handed to polyglot's
        ``Transliterator`` as ``target_lang``.
    :return: the transliterated text.
    """
    # one polyglot transliterator is shared by all lines
    engine = Transliterator(target_lang=target_lang)

    # single-line input needs no splitting
    if "\n" not in text:
        return __transliterate_line(engine, text)

    # handle every line on its own, then restore the line breaks
    return "\n".join(
        __transliterate_line(engine, row) for row in text.split("\n")
    )
|
||||
|
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from app.init import check_and_install_models
|
||||
from app.init import check_and_install_models, check_and_install_transliteration
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_and_install_models(force=True)
|
||||
check_and_install_transliteration(force=True)
|
||||
|
@ -4,5 +4,9 @@ flask-swagger==0.2.14
|
||||
flask-swagger-ui==3.36.0
|
||||
Flask-Limiter==1.4
|
||||
waitress==1.4.4
|
||||
langdetect==1.0.8
|
||||
expiringdict==1.2.1
|
||||
pyicu==2.6
|
||||
pycld2==0.41
|
||||
morfessor==2.0.6
|
||||
polyglot==16.7.4
|
||||
appdirs==1.4.4
|
||||
|
Loading…
Reference in New Issue
Block a user