LibreTranslate/libretranslate/language.py


from functools import lru_cache

from argostranslate import translate

from libretranslate.detect import Detector

# Module-level cache of the installed languages; filled lazily by load_languages().
__languages = None

def load_languages():
    """Return the installed Argos Translate languages, caching the result.

    The list is fetched with ``translate.get_installed_languages()`` the first
    time it is needed and kept in the module-level ``__languages`` variable.
    While the cached value is unset or empty, every call repeats the lookup.

    Returns:
        Whatever ``translate.get_installed_languages()`` returned (cached).
    """
    global __languages

    # Fast path: a non-empty cache is returned as-is.
    if __languages:
        return __languages

    __languages = translate.get_installed_languages()
    return __languages

@lru_cache(maxsize=None)
def load_lang_codes():
    """Return the codes of all installed languages as a tuple.

    The result is memoized by ``lru_cache``, so ``load_languages()`` is only
    consulted on the first call (or after the cache has been cleared).

    Returns:
        A tuple holding the ``code`` attribute of each installed language.
    """
    return tuple(language.code for language in load_languages())

def detect_languages(text):
    """Detect the language(s) of a text or of a batch of texts.

    Args:
        text: A single text, or a list of texts (batch mode).

    Returns:
        A list of ``{"confidence": ..., "language": ...}`` dicts, ordered by
        descending confidence weighted by each candidate's share of the total
        text length. When nothing could be detected, a single entry for
        ``"en"`` with confidence ``0.0`` is returned.
    """
    # A list input switches on batch handling; a lone text is wrapped so the
    # rest of the function can always iterate.
    is_batch = isinstance(text, list)
    texts = text if is_batch else [text]

    lang_codes = load_lang_codes()

    # Collect the detection candidates of every text, remembering how long the
    # text that produced each candidate was.
    candidates = []
    for item in texts:
        try:
            detected = Detector(lang_codes).detect(item)
            for entry in detected:
                entry.text_length = len(item)
            candidates.extend(detected)
        except Exception as e:
            # Best effort: a failure on one text must not abort the others.
            print(str(e))

    # Nothing detected at all: fall back to "en" with zero confidence.
    if not candidates:
        return [{"confidence": 0.0, "language": "en"}]

    # Total length of the texts behind all candidates. This is taken before
    # the batch merge below and is not recomputed afterwards.
    text_length_total = sum(entry.text_length for entry in candidates)

    # In batch mode the same language may have been detected several times.
    # Collapse those into one candidate carrying the average confidence and
    # the summed text length.
    if is_batch:
        merged = []
        for lang_code in lang_codes:
            same_language = [entry for entry in candidates if entry.code == lang_code]
            if not same_language:
                continue

            representative = same_language[0]
            if len(same_language) > 1:
                representative.confidence = sum(entry.confidence for entry in same_language) / len(same_language)
                representative.text_length = sum(entry.text_length for entry in same_language)
            merged.append(representative)

        # Only swap in the merged list when it actually contains something.
        if merged:
            candidates = merged

    def weighted_confidence(entry):
        # Guard against a division by zero when every text was empty.
        if text_length_total == 0:
            return 0
        return (entry.confidence * entry.text_length) / text_length_total

    ranked = sorted(candidates, key=weighted_confidence, reverse=True)

    return [{"confidence": entry.confidence, "language": entry.code} for entry in ranked]


def improve_translation_formatting(source, translation, improve_punctuation=True, remove_single_word_duplicates=True):
    """Adjust a translation so its punctuation and casing mirror the source.

    Args:
        source: The original text. Surrounding whitespace is stripped before
            any comparison.
        translation: The translated text to clean up.
        improve_punctuation: When true, make the final punctuation mark of the
            translation agree with the source (add, replace or drop it).
        remove_single_word_duplicates: When true, collapse a translation of a
            short single-word source that consists of one word repeated over
            and over down to its first word.

    Returns:
        The adjusted translation. An empty (after stripping) source yields
        ``""``. If the translation is empty, either on input or after the
        clean-up steps, the stripped source is returned instead.
    """
    source = source.strip()

    if not source:
        return ""

    if not translation:
        return source

    if improve_punctuation:
        punctuation_chars = ('!', '?', '.', ',', ';', '。')
        source_last_char = source[-1]
        translation_last_char = translation[-1]

        if source_last_char in punctuation_chars:
            if translation_last_char != source_last_char:
                # Replace a mismatching trailing mark, or append the missing one.
                if translation_last_char in punctuation_chars:
                    translation = translation[:-1]

                translation += source_last_char
        elif translation_last_char in punctuation_chars:
            # The source has no trailing punctuation, so the translation
            # should not have any either.
            translation = translation[:-1]

    # A workaround for certain language models that output
    # the single word repeated ad-infinitum (the "salad" bug)
    # https://github.com/LibreTranslate/LibreTranslate/issues/46
    if remove_single_word_duplicates:
        if len(source) < 20 and " " not in source and " " in translation:
            bow = translation.split()
            count = {}
            for word in bow:
                count[word] = count.get(word, 0) + 1

            # One word occurring at least twice as often as there are
            # distinct words is treated as runaway repetition.
            if any(occurrences / len(count) >= 2 for occurrences in count.values()):
                translation = bow[0]

    # Stripping punctuation can leave nothing behind (e.g. translation ".").
    # Check this before any casing logic so the result does not depend on the
    # case of the source: an empty translation always falls back to the source.
    if not translation:
        return source

    if source.islower():
        return translation.lower()

    if source.isupper():
        return translation.upper()

    # Mixed-case source: only align the case of the first character.
    if source[0].islower():
        return translation[0].lower() + translation[1:]

    if source[0].isupper():
        return translation[0].upper() + translation[1:]

    return translation
added transliteration before actual translation -> e.g. if the source language is Russian, argostranslate expects a cyrillic text 2021-03-11 13:32:26 +02:00
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`from functools import lru_cache`
run ruff formatting 2023-07-09 12:29:11 +02:00
Run ruff 2023-10-30 06:20:11 +02:00			`from argostranslate import translate`

Use lingua for language detection 2023-10-30 06:03:00 +02:00			`from libretranslate.detect import Detector`
First commit 2020-12-20 00:40:37 +02:00
Memoize 2022-03-04 17:24:29 +02:00			`__languages = None`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00
Fix language detection error The root cause was load_installed_languages() of argostranslate being called at the top of the file instead of inside a function, this caused the list of installed languages to incorrectly be returned as an empty list. 2022-03-04 10:23:11 +02:00			`def load_languages():`
Memoize 2022-03-04 17:24:29 +02:00			`global __languages`

			`if __languages is None or len(__languages) == 0:`
Upgrade deprecated Argos Translate call - load_installed_languages has been deprecated in favor of get_installed_languages 2022-04-30 13:15:54 +02:00			`__languages = translate.get_installed_languages()`
Memoize 2022-03-04 17:24:29 +02:00
			`return __languages`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`@lru_cache(maxsize=None)`
			`def load_lang_codes():`
			`languages = load_languages()`
Add lexilang for language detection on short texts 2023-10-30 18:52:33 +02:00			`return tuple(l.code for l in languages)`
Use lingua for language detection 2023-10-30 06:03:00 +02:00
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00			`def detect_languages(text):`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`# detect batch processing`
			`if isinstance(text, list):`
			`is_batch = True`
			`else:`
			`is_batch = False`
			`text = [text]`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`lang_codes = load_lang_codes()`

use polyglot for detecting the language 2021-03-11 11:01:12 +02:00			`# get the candidates`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`candidates = []`
			`for t in text:`
Catch unknown language 2021-03-12 17:53:09 +02:00			`try:`
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`d = Detector(lang_codes).detect(t)`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`for i in range(len(d)):`
			`d[i].text_length = len(t)`
			`candidates.extend(d)`
Fix some warnings 2023-10-30 06:09:52 +02:00			`except Exception as e:`
			`print(str(e))`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`# total read bytes of the provided text`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`text_length_total = sum(c.text_length for c in candidates)`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00
			`# this happens if no language could be detected`
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`if not candidates:`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00			`# use language "en" by default but with zero confidence`
Linted with black 2021-05-18 05:41:02 +02:00			`return [{"confidence": 0.0, "language": "en"}]`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`# for multiple occurrences of the same language (can happen on batch detection)`
			`# calculate the average confidence for each language`
			`if is_batch:`
			`temp_average_list = []`
Memoize 2022-03-04 17:24:29 +02:00			`for lang_code in lang_codes:`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`# get all candidates for a specific language`
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`lc = list(filter(lambda l: l.code == lang_code, candidates))`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`if len(lc) > 1:`
			`# if more than one is present, calculate the average confidence`
			`lang = lc[0]`
			`lang.confidence = sum(l.confidence for l in lc) / len(lc)`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`lang.text_length = sum(l.text_length for l in lc)`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00			`temp_average_list.append(lang)`
			`elif lc:`
			`# otherwise just add it to the temporary list`
			`temp_average_list.append(lc[0])`

			`if temp_average_list:`
			`# replace the list`
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`candidates = temp_average_list`
allow batch processing for language detection 2021-03-11 11:52:38 +02:00
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00			`# sort the candidates descending based on the detected confidence`
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`candidates.sort(`
Fix div by zero error 2024-07-16 22:03:22 +02:00			`key=lambda l: 0 if text_length_total == 0 else (l.confidence * l.text_length) / text_length_total, reverse=True`
Linted with black 2021-05-18 05:41:02 +02:00			`)`
use polyglot for detecting the language 2021-03-11 11:01:12 +02:00
Use lingua for language detection 2023-10-30 06:03:00 +02:00			`return [{"confidence": l.confidence, "language": l.code} for l in candidates]`
added transliteration before actual translation -> e.g. if the source language is Russian, argostranslate expects a cyrillic text 2021-03-11 13:32:26 +02:00

Workaround for salad 2023-12-12 00:14:27 +02:00			`def improve_translation_formatting(source, translation, improve_punctuation=True, remove_single_word_duplicates=True):`
move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`source = source.strip()`

Extend /languages to include targets 2022-12-09 23:36:12 +02:00			`if not len(source):`
move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`return ""`
Fixed some ruff warnings: requests without timeout and naming not complying with PEP 2023-07-09 12:38:03 +02:00
Extend /languages to include targets 2022-12-09 23:36:12 +02:00			`if not len(translation):`
			`return source`
Fixed some ruff warnings: requests without timeout and naming not complying with PEP 2023-07-09 12:38:03 +02:00
move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`if improve_punctuation:`
			`source_last_char = source[len(source) - 1]`
			`translation_last_char = translation[len(translation) - 1]`

Improve Japanese punctuation 2023-07-17 19:59:55 +02:00			`punctuation_chars = ['!', '?', '.', ',', ';', '。']`
move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`if source_last_char in punctuation_chars:`
			`if translation_last_char != source_last_char:`
			`if translation_last_char in punctuation_chars:`
			`translation = translation[:-1]`

			`translation += source_last_char`
			`elif translation_last_char in punctuation_chars:`
			`translation = translation[:-1]`

Workaround for salad 2023-12-12 00:14:27 +02:00			`# A workaround for certain language models that output`
			`# the single word repeated ad-infinitum (the "salad" bug)`
			`# https://github.com/LibreTranslate/LibreTranslate/issues/46`
			`if remove_single_word_duplicates:`
			`if len(source) < 20 and source.count(" ") == 0 and translation.count(" ") > 0:`
			`bow = translation.split()`
			`count = {}`
			`for word in bow:`
			`count[word] = count.get(word, 0) + 1`

			`for word in count:`
			`if count[word] / len(count) >= 2:`
			`translation = bow[0]`
			`break`

move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`if source.islower():`
			`return translation.lower()`

			`if source.isupper():`
			`return translation.upper()`

Fix string index out of range fault 2023-10-30 19:09:39 +02:00			`if len(translation) == 0:`
			`return source`

move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`if source[0].islower():`
			`return translation[0].lower() + translation[1:]`

			`if source[0].isupper():`
			`return translation[0].upper() + translation[1:]`

			`return translation`