1
0
mirror of https://github.com/MarkParker5/STARK.git synced 2026-04-23 19:31:10 +02:00
Files
STARK/tests/test_tools/test_dictionary.py
2025-10-27 19:19:24 +01:00

530 lines
16 KiB
Python

import pytest
from stark.tools.dictionary.dictionary import (
Dictionary,
LookupField,
LookupMode,
NameEntry,
)
from stark.tools.dictionary.models import Metadata
from stark.tools.dictionary.storage.storage_memory import (
DictionaryStorageMemory,
)
from stark.tools.dictionary.storage.storage_sqlite import (
DictionaryStorageSQLite,
)
@pytest.fixture(params=["memory", "sqlite"])
def dictionary(request) -> Dictionary:
print(f"Dictionary storage: {request.param}")
if request.param == "memory":
return Dictionary(storage=DictionaryStorageMemory())
elif request.param == "sqlite":
# Use in-memory SQLite for tests
return Dictionary(storage=DictionaryStorageSQLite(":memory:"))
raise ValueError(request.param)
def parse_lang(string: str) -> tuple[str, str]:
lang = "en"
if ":" in string:
lang, string = string.split(":", 1)
return lang, string
@pytest.mark.parametrize(
"name,lookup,data",
[
# English, Cyrillic, and German city name, all should match and return coords
("de:Nürnberg", "en:Nurnberg", {"coords": [49.45, 11.08]}),
("de:Nürnberg", "ru:Нюрнберг", {"coords": [49.45, 11.08]}),
# English hello/hallo, Cyrillic hello/hallo
("en:hello", "de:hallo", {"coords": [49.45, 11.08]}),
("ru:хеллоу", "de:hallo", {"coords": [49.45, 11.08]}),
],
)
def test_write_one_and_lookup(dictionary: Dictionary, name: str, lookup: str, data):
from pprint import pprint
pprint({"name": name, "lookup": lookup, "data": data})
# Use meta_key/meta_val for flexible assertion
lang, name = parse_lang(name)
dictionary.write_one(lang, name, metadata=data)
lang, lookup = parse_lang(lookup)
matches = list(dictionary.lookup(lookup, lang))
assert matches
assert matches[0].metadata == data
def test_write_all_and_clear(dictionary: Dictionary):
data = [
NameEntry(language_code="en", name="foo", metadata={"meta": 1}),
NameEntry(language_code="en", name="bar", metadata={"meta": 2}),
NameEntry(language_code="en", name="baz", metadata={"meta": 3}),
]
dictionary.write_all(data)
for entry in data:
matches = list(dictionary.lookup(entry.name, entry.language_code))
assert matches
assert matches[0].metadata["meta"] == entry.metadata["meta"]
dictionary.clear()
for entry in data:
assert list(dictionary.lookup(entry.name, entry.language_code)) == []
@pytest.mark.parametrize(
"entries,sentence,expected",
[
([("lorem", {"id": 10})], "foo lorem foo", [10]),
(
[
("bar baz", {"id": 29}),
],
"foo ber baz foo",
[29],
),
([("lorem", {"id": 10})], "foo lorem test lorem foo", [10, 10]),
(
[
("imagine dragons", {"id": 1}),
("linkin park", {"id": 2}),
],
"ru:включи имя джин драгонс или имейджин драгонс на колонке и потом добавь в очередь лин кин парк пожалуйста",
[1, 1, 2],
),
],
)
def test_search_in_sentence(
dictionary: Dictionary,
entries: list[tuple[str, Metadata]],
sentence: str,
expected: list[str],
):
for entry, data in entries:
lang, name = parse_lang(entry)
dictionary.write_one(lang, name, data)
lang, sentence = parse_lang(sentence)
results = list(dictionary.search_in_sentence(sentence, lang, mode=LookupMode.FUZZY))
assert results, f"{expected} not found"
found = [r.item.metadata["id"] for r in results]
print(f"{found=}")
assert found == expected
# test_write_and_lookup_multiple_languages merged into test_write_one_and_lookup
@pytest.mark.parametrize(
"query,expected",
[
("nurberg", ["nurberg", "nurburg"]),
("nurburg", ["nurburg", "nurberg"]),
],
)
def test_sorting_of_matches(dictionary: Dictionary, query: str, expected: list[str]):
lang, name = parse_lang("en:nurberg")
dictionary.write_one(lang, name, {})
lang, name = parse_lang("en:nurburg")
dictionary.write_one(lang, name, {})
matches = list(dictionary.lookup_sorted(query, "en", mode=LookupMode.FUZZY))
assert [match.name for match in matches] == expected
def test_clear_removes_all(dictionary: Dictionary):
lang, name = parse_lang("en:foo")
dictionary.write_one(lang, name, {})
dictionary.clear()
assert list(dictionary.lookup("foo", "en")) == []
@pytest.mark.parametrize(
"entries,sentence,expected",
[
# Basic suggestions (supported by current implementation)
(
[("hello", {"id": 1}), ("world", {"id": 2})],
"en:hello world",
[1, 2],
),
(
[("hello", {"id": 1}), ("world", {"id": 2})],
"en:hilo wart",
[1, 2],
),
# Advanced multilingual fuzzy matching
(
[
("spotify", {"id": 1}),
("telegram", {"id": 2}),
("instagram", {"id": 3}),
("led zeppelin", {"id": 4}),
("imagine dragons", {"id": 5}),
("highway to hell", {"id": 6}),
("linkin park", {"id": 7}),
],
"ru:телеграм",
[2],
),
(
[
("spotify", {"id": 1}),
("telegram", {"id": 2}),
("instagram", {"id": 3}),
("led zeppelin", {"id": 4}),
("imagine dragons", {"id": 5}),
("highway to hell", {"id": 6}),
("linkin park", {"id": 7}),
],
"ru:ледзеплин",
[4],
),
(
[
("spotify", {"id": 1}),
("telegram", {"id": 2}),
("instagram", {"id": 3}),
("led zeppelin", {"id": 4}),
("imagine dragons", {"id": 5}),
("highway to hell", {"id": 6}),
("linkin park", {"id": 7}),
],
"ru:имя джин драгонс",
[5],
),
(
[
("spotify", {"id": 1}),
("telegram", {"id": 2}),
("instagram", {"id": 3}),
("led zeppelin", {"id": 4}),
("imagine dragons", {"id": 5}),
("highway to hell", {"id": 6}),
("linkin park", {"id": 7}),
],
"ru:хайвей та хел",
[6],
),
(
[
("spotify", {"id": 1}),
("telegram", {"id": 2}),
("instagram", {"id": 3}),
("led zeppelin", {"id": 4}),
("imagine dragons", {"id": 5}),
("highway to hell", {"id": 6}),
("linkin park", {"id": 7}),
],
"ru:линкольн парк",
[7],
),
(
[
("spotify", {"id": 1}),
("telegram", {"id": 2}),
("instagram", {"id": 3}),
("led zeppelin", {"id": 4}),
("imagine dragons", {"id": 5}),
("highway to hell", {"id": 6}),
("linkin park", {"id": 7}),
],
"ru:спу ти фай",
[1],
),
],
)
def test_lookup(
dictionary: Dictionary,
entries: list[tuple[str, Metadata]],
sentence: str,
expected: list[int],
):
# Load entries into dictionary
for entry, data in entries:
lang, name = parse_lang(entry)
dictionary.write_one(lang, name, data)
lang, sentence = parse_lang(sentence)
# Perform fuzzy search
results = list(dictionary.search_in_sentence(sentence, lang, mode=LookupMode.FUZZY))
assert results, f"{expected} not found"
found = [r.item.metadata["id"] for r in results]
print(f"{found=}")
assert found == expected
@pytest.mark.parametrize(
"entries,query,mode,field,expected_names",
[
# PHONETIC (default) cases
(["en:foo", "en:bar"], "foo", LookupMode.EXACT, LookupField.PHONETIC, ["foo"]),
(
["en:foobar", "en:bar"],
"foobar",
LookupMode.CONTAINS,
LookupField.PHONETIC,
["foobar", "bar"],
),
(
["en:hello", "en:world"],
"hellw",
LookupMode.FUZZY,
LookupField.PHONETIC,
["hello"],
),
(
["en:foo", "en:foobar"],
"foo",
LookupMode.AUTO,
LookupField.PHONETIC,
["foo"],
),
(
["en:foo", "en:bar"],
"fo ba",
LookupMode.AUTO,
LookupField.PHONETIC,
["foo", "bar"],
),
(
["en:hello", "en:world"],
"hellw",
LookupMode.AUTO,
LookupField.PHONETIC,
["hello"],
),
(["en:foo", "en:bar"], "baz", LookupMode.AUTO, LookupField.PHONETIC, []),
# NAME field cases (should behave the same for these examples)
(["en:foo", "en:bar"], "foo", LookupMode.EXACT, LookupField.NAME, ["foo"]),
(
["en:foobar", "en:bar"],
"foobar",
LookupMode.CONTAINS,
LookupField.NAME,
["foobar", "bar"],
),
(
["en:hello", "en:world"],
"hellp",
LookupMode.FUZZY,
LookupField.NAME,
[],
),
(["en:foo", "en:foobar"], "foo", LookupMode.AUTO, LookupField.NAME, ["foo"]),
(
["en:foo", "en:bar"],
"fo ba",
LookupMode.AUTO,
LookupField.NAME,
[],
),
(
["en:foo", "en:foobar"],
"foo",
LookupMode.AUTO,
LookupField.PHONETIC,
["foo"],
),
(
["en:hello", "en:world"],
"hellp",
LookupMode.AUTO,
LookupField.NAME,
[],
),
(["en:foo", "en:bar"], "baz", LookupMode.AUTO, LookupField.NAME, []),
],
)
def test_lookup_modes(
dictionary: Dictionary,
entries: list[str],
query: str,
mode: LookupMode,
field: LookupField,
expected_names: list[str],
):
print(f"Testing '{query}' with mode {mode} and field {field}")
dictionary.clear()
for entry in entries:
lang, name = parse_lang(entry)
dictionary.write_one(lang, name, {})
results = list(dictionary.lookup_sorted(query, "en", mode=mode, field=field))
assert [r.name for r in results] == expected_names
@pytest.mark.parametrize(
"entries,sentence,mode,expected_names,field",
[
# PHONETIC (default) cases
(
["en:dragons", "en:linkin", "en:zeppelin"],
"play dragons and linkin",
LookupMode.EXACT,
["dragons", "linkin"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park", "en:led zeppelin"],
"play imagine dragons and linkin park",
LookupMode.EXACT,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:dragons", "en:linkin", "en:zeppelin"],
"please play imagine dragons and then add linkin park to the queue",
LookupMode.CONTAINS,
["dragons", "linkin"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park", "en:led zeppelin"],
"please play imagine dragons and then add linkin park to the queue",
LookupMode.CONTAINS,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park", "en:led zeppelin"],
"play имя джин драгонс и лин кин парк",
LookupMode.FUZZY,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park"],
"play imagine dragns and linkin prk",
LookupMode.FUZZY,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park", "en:led zeppelin"],
"play imagine dragons and linkin park",
LookupMode.AUTO,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park"],
"please play imagine dragons and add linkin park",
LookupMode.AUTO,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park"],
"play имя джин драгонс и лин кин парк",
LookupMode.AUTO,
["imagine dragons", "linkin park"],
LookupField.PHONETIC,
),
(
["en:imagine dragons", "en:linkin park"],
"play something else",
LookupMode.AUTO,
[],
LookupField.PHONETIC,
),
# NAME field cases (should behave the same for these examples)
(
["en:dragons", "en:linkin", "en:zeppelin"],
"play dragons and linkin",
LookupMode.EXACT,
["dragons", "linkin"],
LookupField.NAME,
),
(
["en:imagine dragons", "en:linkin park", "en:led zeppelin"],
"play imagine dragons and linkin park",
LookupMode.EXACT,
["imagine dragons", "linkin park"],
LookupField.NAME,
),
(
["en:dragons", "en:linkin", "en:zeppelin"],
"please play imagine dragons and then add linkin park to the queue",
LookupMode.CONTAINS,
["dragons", "linkin"],
LookupField.NAME,
),
(
["en:imagine dragons", "en:linkin park", "en:led zeppelin"],
"please play imagine dragons and then add linkin park to the queue",
LookupMode.CONTAINS,
["imagine dragons", "linkin park"],
LookupField.NAME,
),
(
["en:imagine dragons", "en:linkin park"],
"play imagine dragons and linkin park",
LookupMode.FUZZY,
["imagine dragons", "linkin park"],
LookupField.NAME,
),
(
["en:imagine dragons", "en:linkin park"],
"play imagine dragns and linkin prk",
LookupMode.FUZZY,
["imagine dragons", "linkin park"],
LookupField.NAME,
),
(
["en:imagine dragons", "en:linkin park"],
"play linkin park and imagine dragons",
LookupMode.FUZZY,
["linkin park", "imagine dragons"],
LookupField.NAME,
),
],
)
def test_search_in_sentence_modes(
dictionary: Dictionary,
entries: list[str],
sentence: str,
mode: LookupMode,
expected_names: list[str],
field: LookupField,
):
dictionary.clear()
for entry in entries:
lang, name = parse_lang(entry)
dictionary.write_one(lang, name, {})
results = list(
dictionary.search_in_sentence_sorted(sentence, "en", mode=mode, field=field)
)
found_names = [r.item.name for r in results]
assert sorted(found_names) == sorted(expected_names)
@pytest.mark.parametrize(
"field,mode,expected",
[
(LookupField.NAME, LookupMode.AUTO, ["Qatar"]),
(LookupField.PHONETIC, LookupMode.AUTO, ["Qatar"]),
(LookupField.PHONETIC, LookupMode.FUZZY, ["Qatar", "guitar"]),
],
)
def test_lookup_field_name_vs_phonetic(dictionary, field, mode, expected):
dictionary.clear()
dictionary.write_one("en", "Qatar", {"meta": 1})
dictionary.write_one("en", "guitar", {"meta": 2})
dictionary.write_one("en", "foobar", {"meta": 3})
results = list(dictionary.lookup("Qatar", "en", mode=mode, field=field))
found_names = [r.name for r in results]
assert len(found_names) == len(expected), (
f"Expected {len(expected)} matches, but got {len(found_names)} "
f"({', '.join(found_names)})"
)
assert sorted(found_names) == sorted(expected)