2021-03-11 13:32:26 +02:00
|
|
|
|
2023-10-30 06:03:00 +02:00
|
|
|
from functools import lru_cache
|
2023-07-09 12:29:11 +02:00
|
|
|
|
2023-10-30 06:20:11 +02:00
|
|
|
from argostranslate import translate
|
|
|
|
|
2023-10-30 06:03:00 +02:00
|
|
|
from libretranslate.detect import Detector
|
2020-12-20 00:40:37 +02:00
|
|
|
|
2022-03-04 17:24:29 +02:00
|
|
|
# Module-level cache of the installed languages; stays None until
# load_languages() fills it on first use.
__languages = None
|
2021-03-11 11:01:12 +02:00
|
|
|
|
2022-03-04 10:23:11 +02:00
|
|
|
def load_languages():
    """Return the installed languages, querying argostranslate lazily.

    The result is kept in the module-level ``__languages`` variable. The
    lookup is repeated on every call for as long as the cached value is
    ``None`` or empty.
    """
    global __languages

    if not __languages:
        __languages = translate.get_installed_languages()

    return __languages
|
2021-03-11 11:01:12 +02:00
|
|
|
|
2023-10-30 06:03:00 +02:00
|
|
|
@lru_cache(maxsize=None)
def load_lang_codes():
    """Return the ``code`` of every language from ``load_languages()`` as a tuple.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    tuple is computed on the first call and reused afterwards until the cache
    is cleared.
    """
    return tuple(lang.code for lang in load_languages())
|
2023-10-30 06:03:00 +02:00
|
|
|
|
2021-03-11 11:01:12 +02:00
|
|
|
def detect_languages(text):
    """Detect the language(s) of a text or of a batch of texts.

    Args:
        text: Either a single text or a list of texts (batch mode).

    Returns:
        A list of ``{"confidence": ..., "language": ...}`` dicts, ordered from
        the most to the least likely language. When nothing could be detected
        the list holds a single ``"en"`` entry with a confidence of ``0.0``.
    """
    # A list means batch mode; a single text is wrapped so both paths share code.
    is_batch = isinstance(text, list)
    texts = text if is_batch else [text]

    lang_codes = load_lang_codes()

    # Collect the detector's guesses for every text. Detection is best effort:
    # a failure on one text is printed and that text is skipped.
    candidates = []
    for sample in texts:
        try:
            detected = Detector(lang_codes).detect(sample)
            for guess in detected:
                # remember how long the originating text was, for weighting below
                guess.text_length = len(sample)
            candidates.extend(detected)
        except Exception as e:
            print(str(e))

    if not candidates:
        # no language could be detected: default to "en" with zero confidence
        return [{"confidence": 0.0, "language": "en"}]

    # Summed text length over all guesses, taken before any merging happens.
    text_length_total = sum(guess.text_length for guess in candidates)

    if is_batch:
        # In batch mode the same language can show up several times; fold those
        # guesses into one entry per language (mean confidence, summed length).
        merged = []
        for code in lang_codes:
            same_lang = [guess for guess in candidates if guess.code == code]
            if not same_lang:
                continue

            first = same_lang[0]
            if len(same_lang) > 1:
                first.confidence = sum(g.confidence for g in same_lang) / len(same_lang)
                first.text_length = sum(g.text_length for g in same_lang)
            merged.append(first)

        if merged:
            candidates = merged

    def weight(guess):
        # confidence scaled by the share of text the guess is based on
        if text_length_total == 0:
            return 0
        return (guess.confidence * guess.text_length) / text_length_total

    # highest weighted confidence first
    candidates.sort(key=weight, reverse=True)

    return [{"confidence": guess.confidence, "language": guess.code} for guess in candidates]
|
2021-03-11 13:32:26 +02:00
|
|
|
|
|
|
|
|
2023-12-12 00:14:27 +02:00
|
|
|
def improve_translation_formatting(source, translation, improve_punctuation=True, remove_single_word_duplicates=True):
    """Adjust a raw translation so its punctuation and casing mirror the source.

    Args:
        source: Original text; surrounding whitespace is stripped before use.
        translation: Text produced by the translation model.
        improve_punctuation: When true, make the translation end with the same
            punctuation mark as the source, or with none if the source does
            not end with one.
        remove_single_word_duplicates: When true, collapse the translation of
            a short single-word source that repeats one word over and over.

    Returns:
        ``""`` when the stripped source is empty; the stripped source when the
        translation is empty (initially, or after its trailing punctuation was
        removed); otherwise the adjusted translation.
    """
    source = source.strip()

    if not source:
        return ""

    if not translation:
        return source

    if improve_punctuation:
        punctuation_chars = ('!', '?', '.', ',', ';', '。')
        source_last_char = source[-1]
        translation_last_char = translation[-1]

        if source_last_char in punctuation_chars:
            if translation_last_char != source_last_char:
                # swap a differing trailing mark for the source's one
                if translation_last_char in punctuation_chars:
                    translation = translation[:-1]

                translation += source_last_char
        elif translation_last_char in punctuation_chars:
            # the source has no trailing punctuation, so drop the translation's
            translation = translation[:-1]

    # A workaround for certain language models that output
    # the single word repeated ad-infinitum (the "salad" bug)
    # https://github.com/LibreTranslate/LibreTranslate/issues/46
    if remove_single_word_duplicates:
        if len(source) < 20 and " " not in source and " " in translation:
            bow = translation.split()
            count = {}
            for word in bow:
                count[word] = count.get(word, 0) + 1

            # some word occurs at least twice as often as there are distinct words
            if any(occurrences / len(count) >= 2 for occurrences in count.values()):
                translation = bow[0]

    # Stripping the punctuation can leave nothing behind (e.g. a translation of
    # "."). Fall back to the source before any case handling so the result does
    # not depend on the casing of the source.
    if not translation:
        return source

    if source.islower():
        return translation.lower()

    if source.isupper():
        return translation.upper()

    # mixed-case source: only align the case of the first character
    if source[0].islower():
        return translation[0].lower() + translation[1:]

    if source[0].isupper():
        return translation[0].upper() + translation[1:]

    return translation
|
|
|
|
|