From 213a6e8736d12934aaaae2ab3bcddb35a445a7ed Mon Sep 17 00:00:00 2001 From: janvarev Date: Mon, 18 Apr 2022 12:55:33 +0300 Subject: [PATCH] =?UTF-8?q?v5.1=20-=20=D1=81=D0=B4=D0=B5=D0=BB=D0=B0=D0=BD?= =?UTF-8?q?=D0=B0=20=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8=D1=8F=20core.all?= =?UTF-8?q?=5Fnum=5Fto=5Ftext(text),=20=D0=BF=D0=BE=D0=B7=D0=B2=D0=BE?= =?UTF-8?q?=D0=BB=D1=8F=D1=8E=D1=89=D0=B0=D1=8F=20=D0=BA=D0=BE=D0=BD=D0=B2?= =?UTF-8?q?=D0=B5=D1=80=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D1=82=D1=8C=20?= =?UTF-8?q?=D0=B2=D1=81=D0=B5=20=D1=87=D0=B8=D1=81=D0=BB=D0=B0=20=D0=B2=20?= =?UTF-8?q?=D1=82=D0=B5=D0=BA=D1=81=D1=82=D0=B5=20=D0=B4=D0=BB=D1=8F=20?= =?UTF-8?q?=D0=BF=D1=80=D0=BE=D0=B8=D0=B7=D0=BD=D0=BE=D1=88=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F.=20=D0=9E=D1=87=D0=B5=D0=BD=D1=8C=20=D0=BD=D1=83?= =?UTF-8?q?=D0=B6=D0=BD=D0=B0=20=D0=B4=D0=BB=D1=8F=20=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D1=8B=20TTS=20silero.=20=D0=9E=D0=BF=D0=B8=D1=80?= =?UTF-8?q?=D0=B0=D0=B5=D1=82=D1=81=D1=8F=20=D0=BD=D0=B0=20=D0=BD=D0=B0?= =?UTF-8?q?=D0=BF=D0=B8=D1=81=D0=B0=D0=BD=D0=BD=D1=83=D1=8E=20utils/all=5F?= =?UTF-8?q?num=5Fto=5Ftext.=20plugin=5Ftts=5Fsilero=5Fv3.py=20-=20=D0=BE?= =?UTF-8?q?=D0=B1=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20=D1=82=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D1=82=D0=B0=20-=20=D0=BA=D0=BE=D0=BD=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D1=82=D0=B0=D1=86=D0=B8=D1=8F=20=D1=87=D0=B8=D1=81=D0=B5?= =?UTF-8?q?=D0=BB=20=D0=B2=20=D1=81=D1=82=D1=80=D0=BE=D0=BA=D1=83.=20?= =?UTF-8?q?=D0=9F=D0=B0=D1=80=D0=B0=D0=BC=D0=B5=D1=82=D1=80=D1=8B=20=D1=80?= =?UTF-8?q?=D0=B0=D1=81=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BA=D0=B8=20?= =?UTF-8?q?=D0=B0=D0=BA=D1=86=D0=B5=D0=BD=D1=82=D0=B0=20=D0=B8=20"=D1=91"?= =?UTF-8?q?=20=D0=B2=D1=8B=D0=BD=D0=B5=D1=81=D0=B5=D0=BD=D1=8B=20=D0=B2=20?= =?UTF-8?q?=D0=BE=D0=BF=D1=86=D0=B8=D0=B8=20=D0=BF=D1=80=D0=B8=D0=BA=D1=80?= =?UTF-8?q?=D1=83=D1=87=D0=B5=D0=BD=D0=B0=20=D0=B1=D0=B8=D0=B1=D0=BB=D0=B8?= =?UTF-8?q?=D0=BE=D1=82=D0=B5=D0=BA=D0=B0=20mycroftAI/lingua-franca=20?= 
=?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=BA=D0=BE=D0=BD=D0=B2=D0=B5=D1=80=D1=82?= =?UTF-8?q?=D0=B0=D1=86=D0=B8=D0=B8=20=D1=87=D0=B8=D1=81=D0=B5=D0=BB=20?= =?UTF-8?q?=D0=B2=20=D1=81=D1=82=D1=80=D0=BE=D0=BA=D1=83.=20core.py=20-=20?= =?UTF-8?q?=D0=B8=D0=BD=D0=B8=D1=86=D0=B8=D0=B0=D0=BB=D0=B8=D0=B7=D0=B0?= =?UTF-8?q?=D1=86=D0=B8=D1=8F=20=D0=B1=D0=B8=D0=B1=D0=BB=D0=B8=D0=BE=D1=82?= =?UTF-8?q?=D0=B5=D0=BA=D0=B8=20lingua-franca?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- LICENSE | 5 +- lingua_franca/__init__.py | 6 + lingua_franca/bracket_expansion.py | 175 ++ lingua_franca/config.py | 2 + lingua_franca/format.py | 568 ++++++ lingua_franca/internal.py | 774 ++++++++ lingua_franca/lang/__init__.py | 72 + lingua_franca/lang/common_data_ca.py | 197 ++ lingua_franca/lang/common_data_cs.py | 305 +++ lingua_franca/lang/common_data_da.py | 133 ++ lingua_franca/lang/common_data_de.py | 135 ++ lingua_franca/lang/common_data_en.py | 297 +++ lingua_franca/lang/common_data_es.py | 313 +++ lingua_franca/lang/common_data_fa.py | 115 ++ lingua_franca/lang/common_data_fr.py | 98 + lingua_franca/lang/common_data_hu.py | 77 + lingua_franca/lang/common_data_it.py | 321 ++++ lingua_franca/lang/common_data_nl.py | 323 ++++ lingua_franca/lang/common_data_pl.py | 497 +++++ lingua_franca/lang/common_data_pt.py | 135 ++ lingua_franca/lang/common_data_ru.py | 304 +++ lingua_franca/lang/common_data_sl.py | 173 ++ lingua_franca/lang/common_data_sv.py | 72 + lingua_franca/lang/format_ca.py | 596 ++++++ lingua_franca/lang/format_common.py | 47 + lingua_franca/lang/format_cs.py | 389 ++++ lingua_franca/lang/format_da.py | 339 ++++ lingua_franca/lang/format_de.py | 327 ++++ lingua_franca/lang/format_en.py | 386 ++++ lingua_franca/lang/format_es.py | 269 +++ lingua_franca/lang/format_fa.py | 301 +++ lingua_franca/lang/format_fr.py | 251 +++ lingua_franca/lang/format_hu.py | 307 +++ lingua_franca/lang/format_it.py | 342 ++++ lingua_franca/lang/format_nl.py | 
337 ++++ lingua_franca/lang/format_pl.py | 351 ++++ lingua_franca/lang/format_pt.py | 223 +++ lingua_franca/lang/format_ru.py | 474 +++++ lingua_franca/lang/format_sl.py | 419 ++++ lingua_franca/lang/format_sv.py | 376 ++++ lingua_franca/lang/parse_ca.py | 1132 +++++++++++ lingua_franca/lang/parse_common.py | 387 ++++ lingua_franca/lang/parse_cs.py | 1707 +++++++++++++++++ lingua_franca/lang/parse_da.py | 891 +++++++++ lingua_franca/lang/parse_de.py | 1025 ++++++++++ lingua_franca/lang/parse_en.py | 1485 ++++++++++++++ lingua_franca/lang/parse_es.py | 1110 +++++++++++ lingua_franca/lang/parse_fa.py | 381 ++++ lingua_franca/lang/parse_fr.py | 1090 +++++++++++ lingua_franca/lang/parse_hu.py | 26 + lingua_franca/lang/parse_it.py | 1171 +++++++++++ lingua_franca/lang/parse_nl.py | 1339 +++++++++++++ lingua_franca/lang/parse_pl.py | 1404 ++++++++++++++ lingua_franca/lang/parse_pt.py | 1089 +++++++++++ lingua_franca/lang/parse_ru.py | 1685 ++++++++++++++++ lingua_franca/lang/parse_sl.py | 1 + lingua_franca/lang/parse_sv.py | 922 +++++++++ lingua_franca/parse.py | 269 +++ lingua_franca/res/text/ca-es/and.word | 1 + lingua_franca/res/text/ca-es/date_time.json | 130 ++ .../res/text/ca-es/date_time_test.json | 43 + lingua_franca/res/text/ca-es/day.word | 1 + lingua_franca/res/text/ca-es/days.word | 1 + lingua_franca/res/text/ca-es/hour.word | 1 + lingua_franca/res/text/ca-es/hours.word | 1 + lingua_franca/res/text/ca-es/minute.word | 1 + lingua_franca/res/text/ca-es/minutes.word | 1 + lingua_franca/res/text/ca-es/normalize.json | 109 ++ lingua_franca/res/text/ca-es/or.word | 1 + lingua_franca/res/text/ca-es/second.word | 1 + lingua_franca/res/text/ca-es/seconds.word | 1 + lingua_franca/res/text/cs-cz/and.word | 1 + lingua_franca/res/text/cs-cz/date_time.json | 129 ++ .../res/text/cs-cz/date_time_test.json | 43 + lingua_franca/res/text/cs-cz/day.word | 1 + lingua_franca/res/text/cs-cz/days.word | 1 + lingua_franca/res/text/cs-cz/hour.word | 1 + 
lingua_franca/res/text/cs-cz/hours.word | 1 + lingua_franca/res/text/cs-cz/minute.word | 1 + lingua_franca/res/text/cs-cz/minutes.word | 1 + lingua_franca/res/text/cs-cz/normalize.json | 46 + lingua_franca/res/text/cs-cz/or.word | 1 + lingua_franca/res/text/cs-cz/second.word | 1 + lingua_franca/res/text/cs-cz/seconds.word | 1 + lingua_franca/res/text/da-dk/and.word | 1 + lingua_franca/res/text/da-dk/date_time.json | 132 ++ .../res/text/da-dk/date_time_test.json | 32 + lingua_franca/res/text/da-dk/day.word | 1 + lingua_franca/res/text/da-dk/days.word | 1 + lingua_franca/res/text/da-dk/hour.word | 1 + lingua_franca/res/text/da-dk/hours.word | 1 + lingua_franca/res/text/da-dk/minute.word | 1 + lingua_franca/res/text/da-dk/minutes.word | 1 + lingua_franca/res/text/da-dk/or.word | 1 + lingua_franca/res/text/da-dk/second.word | 1 + lingua_franca/res/text/da-dk/seconds.word | 1 + lingua_franca/res/text/de-de/and.word | 1 + lingua_franca/res/text/de-de/date_time.json | 136 ++ .../res/text/de-de/date_time_test.json | 43 + lingua_franca/res/text/de-de/day.word | 1 + lingua_franca/res/text/de-de/days.word | 1 + lingua_franca/res/text/de-de/hour.word | 1 + lingua_franca/res/text/de-de/hours.word | 1 + lingua_franca/res/text/de-de/minute.word | 1 + lingua_franca/res/text/de-de/minutes.word | 1 + lingua_franca/res/text/de-de/or.word | 1 + lingua_franca/res/text/de-de/second.word | 1 + lingua_franca/res/text/de-de/seconds.word | 1 + lingua_franca/res/text/en-au/date_time.json | 129 ++ .../res/text/en-au/date_time_test.json | 43 + lingua_franca/res/text/en-us/and.word | 1 + lingua_franca/res/text/en-us/date_time.json | 129 ++ .../res/text/en-us/date_time_test.json | 43 + lingua_franca/res/text/en-us/day.word | 1 + lingua_franca/res/text/en-us/days.word | 1 + lingua_franca/res/text/en-us/hour.word | 1 + lingua_franca/res/text/en-us/hours.word | 1 + lingua_franca/res/text/en-us/minute.word | 1 + lingua_franca/res/text/en-us/minutes.word | 1 + 
lingua_franca/res/text/en-us/normalize.json | 141 ++ lingua_franca/res/text/en-us/or.word | 1 + lingua_franca/res/text/en-us/second.word | 1 + lingua_franca/res/text/en-us/seconds.word | 1 + lingua_franca/res/text/es-es/day.word | 1 + lingua_franca/res/text/es-es/days.word | 1 + lingua_franca/res/text/es-es/hour.word | 1 + lingua_franca/res/text/es-es/hours.word | 1 + lingua_franca/res/text/es-es/minute.word | 1 + lingua_franca/res/text/es-es/minutes.word | 1 + lingua_franca/res/text/es-es/second.word | 1 + lingua_franca/res/text/es-es/seconds.word | 1 + lingua_franca/res/text/fa-ir/and.word | 1 + lingua_franca/res/text/fa-ir/date_time.json | 180 ++ .../res/text/fa-ir/date_time_test.json | 36 + lingua_franca/res/text/fa-ir/day.word | 1 + lingua_franca/res/text/fa-ir/days.word | 1 + lingua_franca/res/text/fa-ir/hour.word | 1 + lingua_franca/res/text/fa-ir/hours.word | 1 + lingua_franca/res/text/fa-ir/minute.word | 1 + lingua_franca/res/text/fa-ir/minutes.word | 1 + lingua_franca/res/text/fa-ir/or.word | 1 + lingua_franca/res/text/fa-ir/second.word | 1 + lingua_franca/res/text/fa-ir/seconds.word | 1 + lingua_franca/res/text/fr-fr/date_time.json | 147 ++ .../res/text/fr-fr/date_time_test.json | 43 + lingua_franca/res/text/fr-fr/day.word | 1 + lingua_franca/res/text/fr-fr/days.word | 1 + lingua_franca/res/text/fr-fr/hour.word | 1 + lingua_franca/res/text/fr-fr/hours.word | 1 + lingua_franca/res/text/fr-fr/minute.word | 1 + lingua_franca/res/text/fr-fr/minutes.word | 1 + lingua_franca/res/text/fr-fr/second.word | 1 + lingua_franca/res/text/fr-fr/seconds.word | 1 + lingua_franca/res/text/hu-hu/date_time.json | 132 ++ .../res/text/hu-hu/date_time_test.json | 43 + lingua_franca/res/text/it-it/date_time.json | 153 ++ .../res/text/it-it/date_time_test.json | 42 + lingua_franca/res/text/it-it/day.word | 1 + lingua_franca/res/text/it-it/days.word | 1 + lingua_franca/res/text/it-it/hour.word | 1 + lingua_franca/res/text/it-it/hours.word | 1 + 
lingua_franca/res/text/it-it/minute.word | 1 + lingua_franca/res/text/it-it/minutes.word | 1 + lingua_franca/res/text/it-it/second.word | 1 + lingua_franca/res/text/it-it/seconds.word | 1 + lingua_franca/res/text/nl-nl/date_time.json | 136 ++ .../res/text/nl-nl/date_time_test.json | 43 + lingua_franca/res/text/nl-nl/day.word | 1 + lingua_franca/res/text/nl-nl/days.word | 1 + lingua_franca/res/text/nl-nl/hour.word | 1 + lingua_franca/res/text/nl-nl/hours.word | 1 + lingua_franca/res/text/nl-nl/minute.word | 1 + lingua_franca/res/text/nl-nl/minutes.word | 1 + lingua_franca/res/text/nl-nl/second.word | 1 + lingua_franca/res/text/nl-nl/seconds.word | 1 + lingua_franca/res/text/pl-pl/and.word | 2 + lingua_franca/res/text/pl-pl/date_time.json | 129 ++ lingua_franca/res/text/pl-pl/day.word | 1 + lingua_franca/res/text/pl-pl/days.word | 1 + lingua_franca/res/text/pl-pl/hour.word | 1 + lingua_franca/res/text/pl-pl/hours.word | 1 + lingua_franca/res/text/pl-pl/minute.word | 1 + lingua_franca/res/text/pl-pl/minutes.word | 1 + lingua_franca/res/text/pl-pl/or.word | 2 + lingua_franca/res/text/pl-pl/second.word | 1 + lingua_franca/res/text/pl-pl/seconds.word | 1 + lingua_franca/res/text/pt-pt/normalize.json | 98 + lingua_franca/res/text/ru-ru/date_time.json | 149 ++ .../res/text/ru-ru/date_time_test.json | 43 + lingua_franca/res/text/ru-ru/day.word | 1 + lingua_franca/res/text/ru-ru/days.word | 1 + lingua_franca/res/text/ru-ru/hour.word | 1 + lingua_franca/res/text/ru-ru/hours.word | 1 + lingua_franca/res/text/ru-ru/minute.word | 1 + lingua_franca/res/text/ru-ru/minutes.word | 1 + lingua_franca/res/text/ru-ru/normalize.json | 46 + lingua_franca/res/text/ru-ru/second.word | 1 + lingua_franca/res/text/ru-ru/seconds.word | 1 + lingua_franca/res/text/sl-si/and.word | 1 + lingua_franca/res/text/sl-si/date_time.json | 123 ++ .../res/text/sl-si/date_time_test.json | 43 + lingua_franca/res/text/sl-si/day.word | 1 + lingua_franca/res/text/sl-si/days.word | 1 + 
lingua_franca/res/text/sl-si/hour.word | 1 + lingua_franca/res/text/sl-si/hours.word | 1 + lingua_franca/res/text/sl-si/minute.word | 1 + lingua_franca/res/text/sl-si/minutes.word | 1 + lingua_franca/res/text/sl-si/normalize.json | 44 + lingua_franca/res/text/sl-si/or.word | 1 + lingua_franca/res/text/sl-si/second.word | 1 + lingua_franca/res/text/sl-si/seconds.word | 1 + lingua_franca/res/text/sv-se/date_time.json | 129 ++ .../res/text/sv-se/date_time_test.json | 43 + lingua_franca/res/text/sv-se/day.word | 1 + lingua_franca/res/text/sv-se/days.word | 1 + lingua_franca/res/text/sv-se/hour.word | 1 + lingua_franca/res/text/sv-se/hours.word | 1 + lingua_franca/res/text/sv-se/minute.word | 1 + lingua_franca/res/text/sv-se/minutes.word | 1 + lingua_franca/res/text/sv-se/second.word | 1 + lingua_franca/res/text/sv-se/seconds.word | 1 + lingua_franca/res/text/tr-tr/day.word | 1 + lingua_franca/res/text/tr-tr/days.word | 1 + lingua_franca/res/text/tr-tr/hour.word | 1 + lingua_franca/res/text/tr-tr/hours.word | 1 + lingua_franca/res/text/tr-tr/minute.word | 1 + lingua_franca/res/text/tr-tr/minutes.word | 1 + lingua_franca/res/text/tr-tr/second.word | 1 + lingua_franca/res/text/tr-tr/seconds.word | 1 + lingua_franca/time.py | 94 + plugins/core.py | 6 +- plugins_inactive/plugin_tts_silero_v3.py | 18 +- utils/all_num_to_text.py | 39 + vacore.py | 6 +- 234 files changed, 31365 insertions(+), 11 deletions(-) create mode 100644 lingua_franca/__init__.py create mode 100644 lingua_franca/bracket_expansion.py create mode 100644 lingua_franca/config.py create mode 100644 lingua_franca/format.py create mode 100644 lingua_franca/internal.py create mode 100644 lingua_franca/lang/__init__.py create mode 100644 lingua_franca/lang/common_data_ca.py create mode 100644 lingua_franca/lang/common_data_cs.py create mode 100644 lingua_franca/lang/common_data_da.py create mode 100644 lingua_franca/lang/common_data_de.py create mode 100644 lingua_franca/lang/common_data_en.py create mode 100644 
lingua_franca/lang/common_data_es.py create mode 100644 lingua_franca/lang/common_data_fa.py create mode 100644 lingua_franca/lang/common_data_fr.py create mode 100644 lingua_franca/lang/common_data_hu.py create mode 100644 lingua_franca/lang/common_data_it.py create mode 100644 lingua_franca/lang/common_data_nl.py create mode 100644 lingua_franca/lang/common_data_pl.py create mode 100644 lingua_franca/lang/common_data_pt.py create mode 100644 lingua_franca/lang/common_data_ru.py create mode 100644 lingua_franca/lang/common_data_sl.py create mode 100644 lingua_franca/lang/common_data_sv.py create mode 100644 lingua_franca/lang/format_ca.py create mode 100644 lingua_franca/lang/format_common.py create mode 100644 lingua_franca/lang/format_cs.py create mode 100644 lingua_franca/lang/format_da.py create mode 100644 lingua_franca/lang/format_de.py create mode 100644 lingua_franca/lang/format_en.py create mode 100644 lingua_franca/lang/format_es.py create mode 100644 lingua_franca/lang/format_fa.py create mode 100644 lingua_franca/lang/format_fr.py create mode 100644 lingua_franca/lang/format_hu.py create mode 100644 lingua_franca/lang/format_it.py create mode 100644 lingua_franca/lang/format_nl.py create mode 100644 lingua_franca/lang/format_pl.py create mode 100644 lingua_franca/lang/format_pt.py create mode 100644 lingua_franca/lang/format_ru.py create mode 100644 lingua_franca/lang/format_sl.py create mode 100644 lingua_franca/lang/format_sv.py create mode 100644 lingua_franca/lang/parse_ca.py create mode 100644 lingua_franca/lang/parse_common.py create mode 100644 lingua_franca/lang/parse_cs.py create mode 100644 lingua_franca/lang/parse_da.py create mode 100644 lingua_franca/lang/parse_de.py create mode 100644 lingua_franca/lang/parse_en.py create mode 100644 lingua_franca/lang/parse_es.py create mode 100644 lingua_franca/lang/parse_fa.py create mode 100644 lingua_franca/lang/parse_fr.py create mode 100644 lingua_franca/lang/parse_hu.py create mode 100644 
lingua_franca/lang/parse_it.py create mode 100644 lingua_franca/lang/parse_nl.py create mode 100644 lingua_franca/lang/parse_pl.py create mode 100644 lingua_franca/lang/parse_pt.py create mode 100644 lingua_franca/lang/parse_ru.py create mode 100644 lingua_franca/lang/parse_sl.py create mode 100644 lingua_franca/lang/parse_sv.py create mode 100644 lingua_franca/parse.py create mode 100644 lingua_franca/res/text/ca-es/and.word create mode 100644 lingua_franca/res/text/ca-es/date_time.json create mode 100644 lingua_franca/res/text/ca-es/date_time_test.json create mode 100644 lingua_franca/res/text/ca-es/day.word create mode 100644 lingua_franca/res/text/ca-es/days.word create mode 100644 lingua_franca/res/text/ca-es/hour.word create mode 100644 lingua_franca/res/text/ca-es/hours.word create mode 100644 lingua_franca/res/text/ca-es/minute.word create mode 100644 lingua_franca/res/text/ca-es/minutes.word create mode 100644 lingua_franca/res/text/ca-es/normalize.json create mode 100644 lingua_franca/res/text/ca-es/or.word create mode 100644 lingua_franca/res/text/ca-es/second.word create mode 100644 lingua_franca/res/text/ca-es/seconds.word create mode 100644 lingua_franca/res/text/cs-cz/and.word create mode 100644 lingua_franca/res/text/cs-cz/date_time.json create mode 100644 lingua_franca/res/text/cs-cz/date_time_test.json create mode 100644 lingua_franca/res/text/cs-cz/day.word create mode 100644 lingua_franca/res/text/cs-cz/days.word create mode 100644 lingua_franca/res/text/cs-cz/hour.word create mode 100644 lingua_franca/res/text/cs-cz/hours.word create mode 100644 lingua_franca/res/text/cs-cz/minute.word create mode 100644 lingua_franca/res/text/cs-cz/minutes.word create mode 100644 lingua_franca/res/text/cs-cz/normalize.json create mode 100644 lingua_franca/res/text/cs-cz/or.word create mode 100644 lingua_franca/res/text/cs-cz/second.word create mode 100644 lingua_franca/res/text/cs-cz/seconds.word create mode 100644 lingua_franca/res/text/da-dk/and.word create 
mode 100644 lingua_franca/res/text/da-dk/date_time.json create mode 100644 lingua_franca/res/text/da-dk/date_time_test.json create mode 100644 lingua_franca/res/text/da-dk/day.word create mode 100644 lingua_franca/res/text/da-dk/days.word create mode 100644 lingua_franca/res/text/da-dk/hour.word create mode 100644 lingua_franca/res/text/da-dk/hours.word create mode 100644 lingua_franca/res/text/da-dk/minute.word create mode 100644 lingua_franca/res/text/da-dk/minutes.word create mode 100644 lingua_franca/res/text/da-dk/or.word create mode 100644 lingua_franca/res/text/da-dk/second.word create mode 100644 lingua_franca/res/text/da-dk/seconds.word create mode 100644 lingua_franca/res/text/de-de/and.word create mode 100644 lingua_franca/res/text/de-de/date_time.json create mode 100644 lingua_franca/res/text/de-de/date_time_test.json create mode 100644 lingua_franca/res/text/de-de/day.word create mode 100644 lingua_franca/res/text/de-de/days.word create mode 100644 lingua_franca/res/text/de-de/hour.word create mode 100644 lingua_franca/res/text/de-de/hours.word create mode 100644 lingua_franca/res/text/de-de/minute.word create mode 100644 lingua_franca/res/text/de-de/minutes.word create mode 100644 lingua_franca/res/text/de-de/or.word create mode 100644 lingua_franca/res/text/de-de/second.word create mode 100644 lingua_franca/res/text/de-de/seconds.word create mode 100644 lingua_franca/res/text/en-au/date_time.json create mode 100644 lingua_franca/res/text/en-au/date_time_test.json create mode 100644 lingua_franca/res/text/en-us/and.word create mode 100644 lingua_franca/res/text/en-us/date_time.json create mode 100644 lingua_franca/res/text/en-us/date_time_test.json create mode 100644 lingua_franca/res/text/en-us/day.word create mode 100644 lingua_franca/res/text/en-us/days.word create mode 100644 lingua_franca/res/text/en-us/hour.word create mode 100644 lingua_franca/res/text/en-us/hours.word create mode 100644 lingua_franca/res/text/en-us/minute.word create mode 
100644 lingua_franca/res/text/en-us/minutes.word create mode 100644 lingua_franca/res/text/en-us/normalize.json create mode 100644 lingua_franca/res/text/en-us/or.word create mode 100644 lingua_franca/res/text/en-us/second.word create mode 100644 lingua_franca/res/text/en-us/seconds.word create mode 100644 lingua_franca/res/text/es-es/day.word create mode 100644 lingua_franca/res/text/es-es/days.word create mode 100644 lingua_franca/res/text/es-es/hour.word create mode 100644 lingua_franca/res/text/es-es/hours.word create mode 100644 lingua_franca/res/text/es-es/minute.word create mode 100644 lingua_franca/res/text/es-es/minutes.word create mode 100644 lingua_franca/res/text/es-es/second.word create mode 100644 lingua_franca/res/text/es-es/seconds.word create mode 100644 lingua_franca/res/text/fa-ir/and.word create mode 100644 lingua_franca/res/text/fa-ir/date_time.json create mode 100644 lingua_franca/res/text/fa-ir/date_time_test.json create mode 100644 lingua_franca/res/text/fa-ir/day.word create mode 100644 lingua_franca/res/text/fa-ir/days.word create mode 100644 lingua_franca/res/text/fa-ir/hour.word create mode 100644 lingua_franca/res/text/fa-ir/hours.word create mode 100644 lingua_franca/res/text/fa-ir/minute.word create mode 100644 lingua_franca/res/text/fa-ir/minutes.word create mode 100644 lingua_franca/res/text/fa-ir/or.word create mode 100644 lingua_franca/res/text/fa-ir/second.word create mode 100644 lingua_franca/res/text/fa-ir/seconds.word create mode 100644 lingua_franca/res/text/fr-fr/date_time.json create mode 100644 lingua_franca/res/text/fr-fr/date_time_test.json create mode 100644 lingua_franca/res/text/fr-fr/day.word create mode 100644 lingua_franca/res/text/fr-fr/days.word create mode 100644 lingua_franca/res/text/fr-fr/hour.word create mode 100644 lingua_franca/res/text/fr-fr/hours.word create mode 100644 lingua_franca/res/text/fr-fr/minute.word create mode 100644 lingua_franca/res/text/fr-fr/minutes.word create mode 100644 
lingua_franca/res/text/fr-fr/second.word create mode 100644 lingua_franca/res/text/fr-fr/seconds.word create mode 100644 lingua_franca/res/text/hu-hu/date_time.json create mode 100644 lingua_franca/res/text/hu-hu/date_time_test.json create mode 100644 lingua_franca/res/text/it-it/date_time.json create mode 100644 lingua_franca/res/text/it-it/date_time_test.json create mode 100644 lingua_franca/res/text/it-it/day.word create mode 100644 lingua_franca/res/text/it-it/days.word create mode 100644 lingua_franca/res/text/it-it/hour.word create mode 100644 lingua_franca/res/text/it-it/hours.word create mode 100644 lingua_franca/res/text/it-it/minute.word create mode 100644 lingua_franca/res/text/it-it/minutes.word create mode 100644 lingua_franca/res/text/it-it/second.word create mode 100644 lingua_franca/res/text/it-it/seconds.word create mode 100644 lingua_franca/res/text/nl-nl/date_time.json create mode 100644 lingua_franca/res/text/nl-nl/date_time_test.json create mode 100644 lingua_franca/res/text/nl-nl/day.word create mode 100644 lingua_franca/res/text/nl-nl/days.word create mode 100644 lingua_franca/res/text/nl-nl/hour.word create mode 100644 lingua_franca/res/text/nl-nl/hours.word create mode 100644 lingua_franca/res/text/nl-nl/minute.word create mode 100644 lingua_franca/res/text/nl-nl/minutes.word create mode 100644 lingua_franca/res/text/nl-nl/second.word create mode 100644 lingua_franca/res/text/nl-nl/seconds.word create mode 100644 lingua_franca/res/text/pl-pl/and.word create mode 100644 lingua_franca/res/text/pl-pl/date_time.json create mode 100644 lingua_franca/res/text/pl-pl/day.word create mode 100644 lingua_franca/res/text/pl-pl/days.word create mode 100644 lingua_franca/res/text/pl-pl/hour.word create mode 100644 lingua_franca/res/text/pl-pl/hours.word create mode 100644 lingua_franca/res/text/pl-pl/minute.word create mode 100644 lingua_franca/res/text/pl-pl/minutes.word create mode 100644 lingua_franca/res/text/pl-pl/or.word create mode 100644 
lingua_franca/res/text/pl-pl/second.word create mode 100644 lingua_franca/res/text/pl-pl/seconds.word create mode 100644 lingua_franca/res/text/pt-pt/normalize.json create mode 100644 lingua_franca/res/text/ru-ru/date_time.json create mode 100644 lingua_franca/res/text/ru-ru/date_time_test.json create mode 100644 lingua_franca/res/text/ru-ru/day.word create mode 100644 lingua_franca/res/text/ru-ru/days.word create mode 100644 lingua_franca/res/text/ru-ru/hour.word create mode 100644 lingua_franca/res/text/ru-ru/hours.word create mode 100644 lingua_franca/res/text/ru-ru/minute.word create mode 100644 lingua_franca/res/text/ru-ru/minutes.word create mode 100644 lingua_franca/res/text/ru-ru/normalize.json create mode 100644 lingua_franca/res/text/ru-ru/second.word create mode 100644 lingua_franca/res/text/ru-ru/seconds.word create mode 100644 lingua_franca/res/text/sl-si/and.word create mode 100644 lingua_franca/res/text/sl-si/date_time.json create mode 100644 lingua_franca/res/text/sl-si/date_time_test.json create mode 100644 lingua_franca/res/text/sl-si/day.word create mode 100644 lingua_franca/res/text/sl-si/days.word create mode 100644 lingua_franca/res/text/sl-si/hour.word create mode 100644 lingua_franca/res/text/sl-si/hours.word create mode 100644 lingua_franca/res/text/sl-si/minute.word create mode 100644 lingua_franca/res/text/sl-si/minutes.word create mode 100644 lingua_franca/res/text/sl-si/normalize.json create mode 100644 lingua_franca/res/text/sl-si/or.word create mode 100644 lingua_franca/res/text/sl-si/second.word create mode 100644 lingua_franca/res/text/sl-si/seconds.word create mode 100644 lingua_franca/res/text/sv-se/date_time.json create mode 100644 lingua_franca/res/text/sv-se/date_time_test.json create mode 100644 lingua_franca/res/text/sv-se/day.word create mode 100644 lingua_franca/res/text/sv-se/days.word create mode 100644 lingua_franca/res/text/sv-se/hour.word create mode 100644 lingua_franca/res/text/sv-se/hours.word create mode 100644 
lingua_franca/res/text/sv-se/minute.word create mode 100644 lingua_franca/res/text/sv-se/minutes.word create mode 100644 lingua_franca/res/text/sv-se/second.word create mode 100644 lingua_franca/res/text/sv-se/seconds.word create mode 100644 lingua_franca/res/text/tr-tr/day.word create mode 100644 lingua_franca/res/text/tr-tr/days.word create mode 100644 lingua_franca/res/text/tr-tr/hour.word create mode 100644 lingua_franca/res/text/tr-tr/hours.word create mode 100644 lingua_franca/res/text/tr-tr/minute.word create mode 100644 lingua_franca/res/text/tr-tr/minutes.word create mode 100644 lingua_franca/res/text/tr-tr/second.word create mode 100644 lingua_franca/res/text/tr-tr/seconds.word create mode 100644 lingua_franca/time.py create mode 100644 utils/all_num_to_text.py diff --git a/LICENSE b/LICENSE index 431edd8..4c3b57e 100644 --- a/LICENSE +++ b/LICENSE @@ -2,7 +2,7 @@ Irene - russian offline voice assistant MIT License -Copyright (c) 2021 Vladislav Janvarev +Copyright (c) 2021-2022 Vladislav Janvarev Copyright (c) 2020 EnjiRouz Permission is hereby granted, free of charge, to any person obtaining a copy @@ -54,4 +54,7 @@ timer.wav: - licensed under Creative Commons 0 License - URL: https://freesound.org/people/AlphaDarkWolf/sounds/591109/ +MycroftAI/lingua-franca: +- licensed under Apache License 2.0 +- URL: https://github.com/MycroftAI/lingua-franca diff --git a/lingua_franca/__init__.py b/lingua_franca/__init__.py new file mode 100644 index 0000000..0404403 --- /dev/null +++ b/lingua_franca/__init__.py @@ -0,0 +1,6 @@ +from .internal import get_default_lang, set_default_lang, get_default_loc, \ + get_active_langs, _set_active_langs, get_primary_lang_code, \ + get_full_lang_code, resolve_resource_file, load_language, \ + load_languages, unload_language, unload_languages, get_supported_langs + +from lingua_franca import config diff --git a/lingua_franca/bracket_expansion.py b/lingua_franca/bracket_expansion.py new file mode 100644 index 0000000..2998e7a --- 
/dev/null +++ b/lingua_franca/bracket_expansion.py @@ -0,0 +1,175 @@ +# Copyright 2017 Mycroft AI, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Fragment(object): + """(Abstract) empty sentence fragment""" + + def __init__(self, tree): + """ + Construct a sentence tree fragment which is merely a wrapper for + a list of Strings + Args: + tree (?): Base tree for the sentence fragment, type depends on + subclass, refer to those subclasses + """ + self._tree = tree + + def tree(self): + """Return the represented sentence tree as raw data.""" + return self._tree + + def expand(self): + """ + Expanded version of the fragment. In this case an empty sentence. + Returns: + List<List<str>>: A list with an empty sentence (= token/string list) + """ + return [[]] + + def __str__(self): + return self._tree.__str__() + + def __repr__(self): + return self._tree.__repr__() + + +class Word(Fragment): + """ + Single word in the sentence tree. + Construct with a string as argument. + """ + + def expand(self): + """ + Creates one sentence that contains exactly that word. + Returns: + List<List<str>>: A list with the given string as sentence + (= token/string list) + """ + return [[self._tree]] + + +class Sentence(Fragment): + """ + A Sentence made of several concatenations/words. + Construct with a List as argument. + """ + + def expand(self): + """ + Creates a combination of all sub-sentences. 
+ Returns: + List<List<str>>: A list with all subsentence expansions combined in + every possible way + """ + old_expanded = [[]] + for sub in self._tree: + sub_expanded = sub.expand() + new_expanded = [] + while len(old_expanded) > 0: + sentence = old_expanded.pop() + for new in sub_expanded: + new_expanded.append(sentence + new) + old_expanded = new_expanded + return old_expanded + + +class Options(Fragment): + """ + A Combination of possible sub-sentences. + Construct with List as argument. + """ + + def expand(self): + """ + Returns all of its options as separated sub-sentences. + Returns: + List<List<str>>: A list containing the sentences created by all + expansions of its sub-sentences + """ + options = [] + for option in self._tree: + options.extend(option.expand()) + return options + + +class SentenceTreeParser(object): + """ + Generate sentence token trees from a list of tokens + ['1', '(', '2', '|', '3', ')'] -> [['1', '2'], ['1', '3']] + """ + + def __init__(self, tokens): + self.tokens = tokens + + def _parse(self): + """ + Generate sentence token trees + ['1', '(', '2', '|', '3', ')'] -> ['1', ['2', '3']] + """ + self._current_position = 0 + return self._parse_expr() + + def _parse_expr(self): + """ + Generate sentence token trees from the current position to + the next closing parentheses / end of the list and return it + ['1', '(', '2', '|', '3', ')'] -> ['1', [['2'], ['3']]] + ['2', '|', '3'] -> [['2'], ['3']] + """ + # List of all generated sentences + sentence_list = [] + # Currently active sentence + cur_sentence = [] + sentence_list.append(Sentence(cur_sentence)) + # Determine which form the current expression has + while self._current_position < len(self.tokens): + cur = self.tokens[self._current_position] + self._current_position += 1 + if cur == '(': + # Parse the subexpression + subexpr = self._parse_expr() + # Check if the subexpression only has one branch + # -> If so, append "(" and ")" and add it as is + normal_brackets = False + if len(subexpr.tree()) == 1: + 
normal_brackets = True + cur_sentence.append(Word('(')) + # add it to the sentence + cur_sentence.append(subexpr) + if normal_brackets: + cur_sentence.append(Word(')')) + elif cur == '|': + # Begin parsing a new sentence + cur_sentence = [] + sentence_list.append(Sentence(cur_sentence)) + elif cur == ')': + # End parsing the current subexpression + break + # TODO anything special about {sth}? + else: + cur_sentence.append(Word(cur)) + return Options(sentence_list) + + def _expand_tree(self, tree): + """ + Expand a list of sub sentences to all combined sentences. + ['1', ['2', '3']] -> [['1', '2'], ['1', '3']] + """ + return tree.expand() + + def expand_parentheses(self): + tree = self._parse() + return self._expand_tree(tree) \ No newline at end of file diff --git a/lingua_franca/config.py b/lingua_franca/config.py new file mode 100644 index 0000000..4d12ab2 --- /dev/null +++ b/lingua_franca/config.py @@ -0,0 +1,2 @@ +load_langs_on_demand = False +inject_timezones = True diff --git a/lingua_franca/format.py b/lingua_franca/format.py new file mode 100644 index 0000000..7be8716 --- /dev/null +++ b/lingua_franca/format.py @@ -0,0 +1,568 @@ +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Named fields for every fragment of a number that the date/time format
# templates may reference (see DateTimeFormat._number_strings).
NUMBER_TUPLE = namedtuple(
    'number',
    ('x, xx, x0, x_in_x0, xxx, x00, x_in_x00, xx00, xx_in_xx00, x000, ' +
     'x_in_x000, x0_in_x000, x_in_0x00'))


class DateTimeFormat:
    """Render dates, date-times and years from per-language JSON data.

    The data for a language is read lazily by :meth:`cache` from
    ``<config_path>/<lang>/date_time.json`` and stored in
    ``self.lang_config[lang]``. Every other method indexes
    ``self.lang_config[lang]`` directly, so ``cache(lang)`` must have been
    called for that language first.
    """

    # Sections whose numbered entries ("1", "2", ...) carry a 'match' regex.
    _REGEX_SECTIONS = ('decade_format', 'hundreds_format',
                       'thousand_format', 'year_format')

    def __init__(self, config_path):
        """
        Args:
            config_path (str): directory holding one sub-directory per
                language code, each containing a ``date_time.json`` file.
        """
        self.lang_config = {}
        self.config_path = config_path

    def _read_config(self, lang_dir):
        """Parse ``<config_path>/<lang_dir>/date_time.json`` as UTF-8 JSON."""
        path = os.path.join(self.config_path, lang_dir, 'date_time.json')
        with open(path, 'r', encoding='utf8') as config_file:
            return json.load(config_file)

    def cache(self, lang):
        """Load and prepare the formatting data for ``lang`` (once).

        Falls back to the ``en-us`` data when the language has no
        ``date_time.json``. Both files are read as UTF-8 (the fallback used
        to rely on the platform default encoding).

        Args:
            lang (str): language code naming the resource directory.
        """
        if lang in self.lang_config:
            return
        try:
            # Attempt to load the language-specific formatting data
            config = self._read_config(lang)
        except FileNotFoundError:
            # Fallback to English formatting
            config = self._read_config('en-us')

        # Pre-compile the 'match' pattern of every numbered entry so that
        # _format_string() can use entry['re'] directly.
        for section in self._REGEX_SECTIONS:
            entries = config[section]
            i = 1
            while entries.get(str(i)):
                entries[str(i)]['re'] = re.compile(entries[str(i)]['match'])
                i += 1
        self.lang_config[lang] = config

    def _number_strings(self, number, lang):
        """Look up the spoken form of each part of ``number``.

        Every field is taken from the language's ``'number'`` table and
        falls back to the plain digits when the table has no (or an empty)
        entry for it.

        Args:
            number (int): the number to split (callers pass ``dt.year``).
            lang (str): language code already loaded via :meth:`cache`.

        Returns:
            NUMBER_TUPLE: one string per template placeholder.
        """
        words = self.lang_config[lang]['number']

        def spoken(value):
            key = str(value)
            return words.get(key) or key

        def part(modulus, divisor):
            # number % modulus is never negative, so floor == truncation.
            return int(number % modulus // divisor)

        tens_digit = part(100, 10)
        hundreds_digit = part(1000, 100)
        hundreds_pair = part(10000, 100)
        thousands_digit = part(10000, 1000)

        return NUMBER_TUPLE(
            x=spoken(number % 10),
            xx=spoken(number % 100),
            x0=spoken(tens_digit * 10),
            x_in_x0=spoken(tens_digit),
            xxx=spoken(number % 1000),
            x00=spoken(hundreds_digit * 100),
            x_in_x00=spoken(hundreds_digit),
            xx00=spoken(hundreds_pair * 100),
            xx_in_xx00=spoken(hundreds_pair),
            x000=spoken(thousands_digit * 1000),
            x_in_x000=spoken(thousands_digit),
            x0_in_x000=spoken(thousands_digit * 10),
            # Previously the fallback sat inside .get(...), so a missing
            # entry produced None instead of the digit.
            x_in_0x00=spoken(hundreds_digit))

    def _format_string(self, number, format_section, lang):
        """Pick the template of ``format_section`` that matches ``number``.

        Numbered entries are tried in order ("1", "2", ...); the first one
        whose compiled regex matches ``str(number)`` wins, otherwise the
        section's ``'default'`` template is returned.
        """
        section = self.lang_config[lang][format_section]
        i = 1
        while section.get(str(i)):
            entry = section[str(i)]
            if entry['re'].match(str(number)):
                return entry['format']
            i += 1
        return section['default']

    def _decade_format(self, number, number_tuple, lang):
        """Format the last two digits of ``number`` ('decade_format')."""
        s = self._format_string(number % 100, 'decade_format', lang)
        return s.format(x=number_tuple.x, xx=number_tuple.xx,
                        x0=number_tuple.x0, x_in_x0=number_tuple.x_in_x0,
                        number=str(number % 100))

    def _number_format_hundreds(self, number, number_tuple, lang,
                                formatted_decade):
        """Format the last three digits of ``number`` ('hundreds_format')."""
        s = self._format_string(number % 1000, 'hundreds_format', lang)
        return s.format(xxx=number_tuple.xxx, x00=number_tuple.x00,
                        x_in_x00=number_tuple.x_in_x00,
                        formatted_decade=formatted_decade,
                        number=str(number % 1000))

    def _number_format_thousand(self, number, number_tuple, lang,
                                formatted_decade, formatted_hundreds):
        """Format the last four digits of ``number`` ('thousand_format')."""
        s = self._format_string(number % 10000, 'thousand_format', lang)
        return s.format(x_in_x00=number_tuple.x_in_x00,
                        xx00=number_tuple.xx00,
                        xx_in_xx00=number_tuple.xx_in_xx00,
                        x000=number_tuple.x000,
                        x_in_x000=number_tuple.x_in_x000,
                        x0_in_x000=number_tuple.x0_in_x000,
                        x_in_0x00=number_tuple.x_in_0x00,
                        formatted_decade=formatted_decade,
                        formatted_hundreds=formatted_hundreds,
                        number=str(number % 10000))

    def date_format(self, dt, lang, now):
        """Format ``dt`` as a date, shortened relative to ``now`` if given.

        With ``now`` set, the year is dropped when both share a year, the
        month too when ``dt`` is a later day of the same month, and
        'tomorrow' / 'today' / 'yesterday' are used for adjacent days.
        """
        format_str = 'date_full'
        if now:
            if dt.year == now.year:
                format_str = 'date_full_no_year'
                if dt.month == now.month and dt.day > now.day:
                    format_str = 'date_full_no_year_month'

            tomorrow = now + datetime.timedelta(days=1)
            yesterday = now - datetime.timedelta(days=1)
            if tomorrow.date() == dt.date():
                format_str = 'tomorrow'
            elif now.date() == dt.date():
                format_str = 'today'
            elif yesterday.date() == dt.date():
                format_str = 'yesterday'

        config = self.lang_config[lang]
        return config['date_format'][format_str].format(
            weekday=config['weekday'][str(dt.weekday())],
            month=config['month'][str(dt.month)],
            day=config['date'][str(dt.day)],
            formatted_year=self.year_format(dt, lang, False))

    def date_time_format(self, dt, lang, now, use_24hour, use_ampm):
        """Combine :meth:`date_format` with the module-level ``nice_time``."""
        date_str = self.date_format(dt, lang, now)
        time_str = nice_time(dt, lang, use_24hour=use_24hour,
                             use_ampm=use_ampm)
        return self.lang_config[lang]['date_time_format']['date_time'].format(
            formatted_date=date_str, formatted_time=time_str)

    def year_format(self, dt, lang, bc):
        """Format ``dt.year``; adds the language's 'bc' marker if ``bc``."""
        number_tuple = self._number_strings(dt.year, lang)
        formatted_bc = (
            self.lang_config[lang]['year_format']['bc'] if bc else '')
        formatted_decade = self._decade_format(
            dt.year, number_tuple, lang)
        formatted_hundreds = self._number_format_hundreds(
            dt.year, number_tuple, lang, formatted_decade)
        formatted_thousand = self._number_format_thousand(
            dt.year, number_tuple, lang, formatted_decade, formatted_hundreds)

        s = self._format_string(dt.year, 'year_format', lang)

        # Templates may leave empty placeholders; collapse the resulting
        # runs of spaces and trim the ends.
        return re.sub(' +', ' ',
                      s.format(
                          year=str(dt.year),
                          century=str(int(dt.year / 100)),
                          decade=str(dt.year % 100),
                          formatted_hundreds=formatted_hundreds,
                          formatted_decade=formatted_decade,
                          formatted_thousand=formatted_thousand,
                          bc=formatted_bc)).strip()
@localized_function()
def pronounce_number(number, lang='', places=2, short_scale=True,
                     scientific=False, ordinals=False):
    """
    Convert a number to its spoken equivalent

    For example, '5' would be 'five'

    This function has no body of its own: the ``localized_function``
    decorator forwards the call to the implementation for ``lang``.

    Args:
        number: the number to pronounce
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
        places (int): number of decimal places to express, default 2
        short_scale (bool) : use short (True) or long scale (False)
            https://en.wikipedia.org/wiki/Names_of_large_numbers
        scientific (bool) : convert and pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form "first" instead of "one"
    Returns:
        (str): The pronounced number
    """
def nice_year(dt, lang='', bc=False):
    """
    Format a datetime to a pronounceable year

    For example, generate 'nineteen-hundred and eighty-four' for year 1984

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
        bc (bool): put B.C. after the year (python does not support dates
            B.C. in datetime)
    Returns:
        (str): The formatted year string
    """
    locale_code = get_full_lang_code(lang)
    # The formatter indexes its per-language data directly, so make sure
    # that data is loaded before asking for the year.
    date_time_format.cache(locale_code)
    spoken_year = date_time_format.year_format(dt, locale_code, bc)
    return spoken_year
def join_list(items, connector, sep=None, lang=''):
    """ Join a list into a phrase using the given connector word

    Examples:
        join_list([1,2,3], "and") ->  "1, 2 and 3"
        join_list([1,2,3], "and", ";") ->  "1; 2 and 3"

    Args:
        items (array): items to be joined; each one is converted with
                       str(), so non-string items are accepted
        connector (str): connecting word (resource name), like "and" or "or"
        sep (str, optional): separator character, default = ","
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
    Returns:
        str: the connected list phrase ("" for an empty list)
    """

    if not items:
        return ""
    if len(items) == 1:
        return str(items[0])

    if not sep:
        sep = ", "
    else:
        sep += " "
    # The last item goes through str() like the others; concatenating it
    # raw raised TypeError for non-string items such as ints.
    return (sep.join(str(item) for item in items[:-1]) +
            " " + _translate_word(connector, lang) +
            " " + str(items[-1]))
+ + As of July 2020, this function sanitizes some dates and "x ^ y"-formatted + exponents in the following primary language codes: + da de nl sv + + Args: + text (str): input text to sanitize + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + + Example: + assertEqual(nice_response_de("dies ist der 31. mai"), + "dies ist der einunddreißigste mai") + + assertEqual(nice_response_de("10 ^ 2"), + "10 hoch 2") + """ diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py new file mode 100644 index 0000000..bb2e04a --- /dev/null +++ b/lingua_franca/internal.py @@ -0,0 +1,774 @@ +import os.path +from functools import wraps +from importlib import import_module +from inspect import signature + +from warnings import warn +from datetime import datetime +from lingua_franca import config +from lingua_franca.time import to_local + + +_SUPPORTED_LANGUAGES = ("ca", "cs", "da", "de", "en", "es", "fr", "hu", + "it", "nl", "pl", "pt", "ru", "sl", "sv", "fa") + +_SUPPORTED_FULL_LOCALIZATIONS = ("ca-es", "cs-cz", "da-dk", "de-de", + "en-au", "en-us", "es-es", "fr-fr", + "hu-hu", "it-it", "nl-nl", "pl-pl", + "fa-ir", "pt-pt", "ru-ru", "sl-si", + "sv-se", "tr-tr") + +_DEFAULT_FULL_LANG_CODES = {'ca': 'ca-es', + 'cs': 'cs-cz', + 'da': 'da-dk', + 'de': 'de-de', + 'en': 'en-us', + 'es': 'es-es', + 'fa': 'fa-ir', + 'fr': 'fr-fr', + 'hu': 'hu-hu', + 'it': 'it-it', + 'nl': 'nl-nl', + 'pl': 'pl-pl', + 'pt': 'pt-pt', + 'ru': 'ru-ru', + 'sl': 'sl-si', + 'sv': 'sv-se', + 'tr': 'tr-tr'} + +__default_lang = None +__active_lang_code = None +__loaded_langs = [] + +_localized_functions = {} + +# TODO the deprecation of 'lang=None' and 'lang=' can refer to +# commit 35efd0661a178e82f6745ad17e10e607c0d83472 for the "proper" state +# of affairs, raising the errors below instead of deprecation warnings + +# Once the deprecation is complete, functions which have had their default +# parameter changed from lang=None to lang='' should be 
def _raise_unsupported_language(language):
    """
    Raise an error when a language is unsupported

    Arguments:
        language: str
            The language that was supplied.

    Raises:
        UnsupportedLanguageError: always; the message names the rejected
            language and lists the supported primary language codes.
    """
    template = ("\nLanguage '{language}' is not yet "
                "supported by Lingua Franca. "
                "Supported language codes "
                "include the following:\n{supported}")
    message = template.format(language=language,
                              supported=' '.join(_SUPPORTED_LANGUAGES))
    raise UnsupportedLanguageError(message)
def load_language(lang):
    """Load `lang` and its functions into memory. Will only import those
    functions which belong to a loaded module. In other words, if you have
    lingua_franca.parse loaded, but *not* lingua_franca.format,
    running `load_language('es')` will only import the Spanish-language
    parsers, and not the formatters.

    The reverse is also true: importing a module, such as
    `import lingua_franca.parse`, will only import those functions
    which belong to currently-loaded languages.

    Arguments:
        lang (str): the language code to load (any supported lang code,
                    whether 'primary' or 'full')
                    Case-insensitive.

    Raises:
        TypeError: if `lang` is not a str.
    """
    if not isinstance(lang, str):
        # str(type(...)) is required here: concatenating the type object
        # itself raised an unrelated "can only concatenate str" error.
        raise TypeError("lingua_franca.load_language expects 'str' "
                        "(got " + str(type(lang)) + ")")
    # The supported-code tables are lower-case; normalise so that codes
    # such as "en-US" are accepted, as documented.
    lang = lang.lower()
    if lang not in _SUPPORTED_LANGUAGES:
        if lang in _SUPPORTED_FULL_LOCALIZATIONS:
            lang = get_primary_lang_code(lang)
    if lang not in __loaded_langs:
        __loaded_langs.append(lang)
        # The first language ever loaded becomes the default.
        if not __default_lang:
            set_default_lang(lang)
        _set_active_langs(__loaded_langs)
For the default language *family* - which is passed to + most parsers and formatters - call `get_default_lang` + + The 'localized' portion conforms to ISO 3166-1 alpha-2 + https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 + """ + return __active_lang_code + + +def set_default_lang(lang_code): + """ Set the active BCP-47 language code to be used in formatting/parsing + Will choose a default localization if passed a primary language family + (ex: `set_default_lang("en")` will default to "en-US") + + Will respect localization when passed a full lang code. + + For more information about valid lang codes, see get_default_lang() + and get_default_loc() + + Args: + lang(str): BCP-47 language code, e.g. "en-us" or "es-mx" + """ + global __default_lang, __active_lang_code + + lang_code = lang_code.lower() + primary_lang_code = get_primary_lang_code(lang_code) + if primary_lang_code not in _SUPPORTED_LANGUAGES: + _raise_unsupported_language(lang_code) + else: + __default_lang = primary_lang_code + + # make sure the default language is loaded. + # also make sure the default language is at the front. + # position doesn't matter here, but it clarifies things while debugging. 
def __get_primary_lang_code_deprecation_warning(lang=''):
    """ Get the primary language code

    Args:
        lang(str, optional): A BCP-47 language code
                              (If omitted, equivalent to
                              `lingua_franca.get_default_lang()`)

    Returns:
        str: A primary language family, such as "en", "de" or "pt"

    Raises:
        TypeError: if `lang` is truthy but not a str.
        ValueError: if `lang` is neither a known code nor shaped like
            "xx-..." (a hyphen as third character).
    """
    # split on the hyphen and only return the primary-language code
    # NOTE: This is typically a two character code.  The standard allows
    #       1, 2, 3 and 4 character codes.  In the future we can consider
    #       mapping from the 3 to 2 character codes, for example.  But for
    #       now we can just be careful in use.
    if not lang:
        return get_default_lang()
    if not isinstance(lang, str):
        # str(type(...)) is required: concatenating the type object itself
        # raised an unrelated "can only concatenate str" error.
        raise TypeError("lingua_franca.get_primary_lang_code() expects"
                        " an (optional)argument of type 'str', but got " +
                        str(type(lang)))

    lang_code = lang.lower()
    if lang_code not in _SUPPORTED_FULL_LOCALIZATIONS and lang_code not in \
            _SUPPORTED_LANGUAGES:
        # We don't know this language code.  Check if the input is
        # formatted like a language code, i.e. its third character is a
        # hyphen (same test as rebuilding lang[:2] + "-" + lang[3:]).
        if lang[2:3] != "-":
            raise ValueError("Invalid input: " + lang)
        warn("Unrecognized language code: '" + lang + "', but it appears "
             "to be a valid language code. Returning the first two chars.")
    return lang_code.split("-")[0]
A call to + + extract_number('uno', lang='es') + + will locate and call + + lingua_franca.lang.parse_es.extract_number_es('uno') + + By contrast, here's the decorator above format.nice_number, with the param: + + @localized_function(run_own_code_on=[UnsupportedLanguageError]) + def nice_number(number, lang='', speech=True, denominators=None): + + Here, nice_number() itself will be executed in the event that the localizer + raises an UnsupportedLanguageError. + + Arguments: + run_own_code_on(list(type), optional) + A list of Error types (ValueError, NotImplementedError, etc) + which, if they are raised, will trigger the wrapped function's + own code. + + If this argument is omitted, the function itself will never + be run. Calls to the wrapped function will be passed to the + appropriate, localized function. + + + """ + # Make sure everything in run_own_code_on is an Error or None + BadTypeError = \ + ValueError("@localized_function(run_own_code_on=<>) expected an " + "Error type, or a list of Error types. 
Instead, it " + "received this value:\n" + str(run_own_code_on)) + # TODO deprecate these kwarg values 6-12 months after v0.3.0 releases + + if run_own_code_on != [None]: + def is_error_type(_type): + if not callable(_type): + return False + _instance = _type() + rval = isinstance(_instance, BaseException) if _instance else True + del _instance + return rval + if not isinstance(run_own_code_on, list): + try: + run_own_code_on = list(run_own_code_on) + except TypeError: + raise BadTypeError + if not all((is_error_type(e) for e in run_own_code_on)): + raise BadTypeError + + # Begin wrapper + def localized_function_decorator(func): + # Wrapper's logic + def _call_localized_function(func, *args, **kwargs): + lang_code = None + load_langs_on_demand = config.load_langs_on_demand + unload_language_afterward = False + func_signature = signature(func) + func_params = list(func_signature.parameters) + lang_param_index = func_params.index('lang') + full_lang_code = None + + # Check if we need to add timezone awareness to any datetime object + if config.inject_timezones: + for key, value in kwargs.items(): + if isinstance(value, datetime) and value.tzinfo is None: + kwargs[key] = to_local(value) + for idx, value in enumerate(args): + if isinstance(value, datetime) and value.tzinfo is None: + args = (*args[:idx], to_local(value), *args[idx + 1:]) + + # Check if we're passing a lang as a kwarg + if 'lang' in kwargs.keys(): + lang_param = kwargs['lang'] + if lang_param is None: + warn(NoneLangWarning) + lang_code = get_default_lang() + else: + lang_code = lang_param + + # Check if we're passing a lang as a positional arg + elif lang_param_index < len(args): + lang_param = args[lang_param_index] + if lang_param is None: + warn(NoneLangWarning) + lang_code = get_default_lang() + elif lang_param in _SUPPORTED_LANGUAGES or \ + lang_param in _SUPPORTED_FULL_LOCALIZATIONS: + lang_code = args[lang_param_index] + args = args[:lang_param_index] + args[lang_param_index+1:] + + # Turns out, 
we aren't passing a lang code at all + lang_code = lang_code or get_default_lang() + if not lang_code: + if load_langs_on_demand: + raise ModuleNotFoundError("No language module loaded " + "and none specified.") + else: + raise ModuleNotFoundError("No language module loaded.") + + if lang_code not in _SUPPORTED_LANGUAGES: + try: + tmp = lang_code + __use_tmp = True + lang_code = get_primary_lang_code(lang_code) + except ValueError: + __error = \ + UnsupportedLanguageError("\nLanguage '{language}' is not yet " + "supported by Lingua Franca. " + "Supported language codes " + "include the following:\n{supported}" + .format( + language=lang_code, + supported=_SUPPORTED_FULL_LOCALIZATIONS)) + if UnsupportedLanguageError in run_own_code_on: + raise __error + else: + warn(DeprecationWarning("The following warning will " + "become an exception in a future " + "version of Lingua Franca." + + str(__error))) + lang_code = get_default_lang() + full_lang_code = get_full_lang_code() + __use_tmp = False + if lang_code not in _SUPPORTED_LANGUAGES: + _raise_unsupported_language(lang_code) + if __use_tmp: + full_lang_code = tmp + else: + full_lang_code = get_full_lang_code(lang_code) + + # Here comes the ugly business. + _module_name = func.__module__.split('.')[-1] + _module = import_module(".lang." + _module_name + + "_" + lang_code, "lingua_franca") + # The nonsense above gets you from lingua_franca.parse + # to lingua_franca.lang.parse_xx + if _module_name not in _localized_functions.keys(): + raise ModuleNotFoundError("Module lingua_franca." 
+ + _module_name + " not recognized") + if lang_code not in _localized_functions[_module_name].keys(): + if load_langs_on_demand: + load_language(lang_code) + unload_language_afterward = True + else: + raise ModuleNotFoundError(_module_name + + " module of language '" + + lang_code + + "' is not currently loaded.") + func_name = func.__name__.split('.')[-1] + # At some point in the past, both the module and the language + # were imported/loaded, respectively. + # When that happened, we cached the *signature* of each + # localized function. + # + # This is the crucial element that allows us to import funcs + # on the fly. + # + # If we didn't find a localized function to correspond with + # the wrapped function, we cached NotImplementedError in its + # place. + loc_signature = _localized_functions[_module_name][lang_code][func_name] + if isinstance(loc_signature, type(NotImplementedError())): + raise loc_signature + + # Now we have the appropriate localized module. Let's get + # the localized function. + try: + localized_func = getattr( + _module, func_name + "_" + lang_code) + except AttributeError: + raise FunctionNotLocalizedError(func_name, lang_code) + + # We now have a localized function, such as + # lingua_franca.parse.extract_datetime_en + # Get 'lang' out of its parameters. + if 'lang' in kwargs: + del kwargs['lang'] + args = tuple(arg for arg in list(args) if + arg not in (lang_code, full_lang_code)) + + # Now we call the function, ignoring any kwargs from the + # wrapped function that aren't in the localized function. 
+ r_val = localized_func(*args, + **{arg: val for arg, val + in kwargs.items() + if arg in loc_signature.parameters}) + + # Unload all the stuff we just assembled and imported + del localized_func + del _module + if unload_language_afterward: + unload_language(lang_code) + return r_val + + # Actual wrapper + @wraps(func) + def call_localized_function(*args, **kwargs): + if run_own_code_on != [type(None)]: + try: + return _call_localized_function(func, *args, **kwargs) + except Exception as e: # Intercept, check for run_own_code_on + if any((isinstance(e, error) for error in run_own_code_on)): + return func(*args, **kwargs) + else: + raise e + else: # don't intercept any exceptions + return _call_localized_function(func, *args, **kwargs) + return call_localized_function + try: + return localized_function_decorator + except NotImplementedError as e: + warn(str(e)) + return + + +def populate_localized_function_dict(lf_module, langs=get_active_langs()): + """Returns a dictionary of dictionaries, containing localized functions. + + Used by the top-level modules to locate, cache, and call localized funcs. + + Arguments: + lf_module(str) - - the name of the top-level module + + Returns: + Dict - - {language_code: {function_name(str): function}} + + Note: + The dictionary returned can be used directly, + but it's normally discarded. Rather, this function will create + the dictionary as a member of + `lingua_franca.internal._localized_functions`, + and its members are invoked via the `@localized_function` decorator. + + Example: + populate_localized_function_dict("format")["en"]["pronounce_number"](1) + "one" + """ + bad_lang_code = "Language code '{}' is registered with" \ + " Lingua Franca, but its " + lf_module + " module" \ + " could not be found." 
+ return_dict = {} + for lang_code in langs: + primary_lang_code = get_primary_lang_code(lang_code) + return_dict[primary_lang_code] = {} + _FUNCTION_NOT_FOUND = "" + try: + lang_common_data = import_module(".lang.common_data_" + primary_lang_code, + "lingua_franca") + _FUNCTION_NOT_FOUND = getattr(lang_common_data, + "_FUNCTION_NOT_IMPLEMENTED_WARNING") + del lang_common_data + except Exception: + _FUNCTION_NOT_FOUND = "This function has not been implemented" \ + " in the specified language." + _FUNCTION_NOT_FOUND = FunctionNotLocalizedError(_FUNCTION_NOT_FOUND) + + try: + mod = import_module(".lang." + lf_module + "_" + primary_lang_code, + "lingua_franca") + except ModuleNotFoundError: + warn(Warning(bad_lang_code.format(primary_lang_code))) + continue + + function_names = getattr(import_module("." + lf_module, "lingua_franca"), + "_REGISTERED_FUNCTIONS") + for function_name in function_names: + try: + function = getattr(mod, function_name + + "_" + primary_lang_code) + function_signature = signature(function) + del function + except AttributeError: + function_signature = _FUNCTION_NOT_FOUND + # TODO log these occurrences: "function 'function_name' not + # implemented in language 'primary_lang_code'" + # + # Perhaps provide this info to autodocs, to help volunteers + # identify the functions in need of localization + return_dict[primary_lang_code][function_name] = function_signature + + del mod + _localized_functions[lf_module] = return_dict + return _localized_functions[lf_module] + + +def resolve_resource_file(res_name, data_dir=None): + """Convert a resource into an absolute filename. + + Resource names are in the form: 'filename.ext' + or 'path/filename.ext' + + The system wil look for ~/.mycroft/res_name first, and + if not found will look at / opt/mycroft/res_name, + then finally it will look for res_name in the 'mycroft/res' + folder of the source code package. 
+ + Example: + With mycroft running as the user 'bob', if you called + resolve_resource_file('snd/beep.wav') + it would return either '/home/bob/.mycroft/snd/beep.wav' or + '/opt/mycroft/snd/beep.wav' or '.../mycroft/res/snd/beep.wav', + where the '...' is replaced by the path where the package has + been installed. + + Args: + res_name(str): a resource path/name + Returns: + str: path to resource or None if no resource found + """ + # First look for fully qualified file (e.g. a user setting) + if os.path.isfile(res_name): + return res_name + + # Now look for ~/.mycroft/res_name (in user folder) + filename = os.path.expanduser("~/.mycroft/" + res_name) + if os.path.isfile(filename): + return filename + + # Next look for /opt/mycroft/res/res_name + data_dir = data_dir or os.path.expanduser("/opt/mycroft/res/") + filename = os.path.expanduser(os.path.join(data_dir, res_name)) + if os.path.isfile(filename): + return filename + + # Finally look for it in the source package + filename = os.path.join(os.path.dirname(__file__), 'res', res_name) + filename = os.path.abspath(os.path.normpath(filename)) + if os.path.isfile(filename): + return filename + + return None # Resource cannot be resolved + + +def lookup_variant(mappings, key="variant"): + """function decorator + maps strings to Enums expected by language specific functions + mappings can be used to translate values read from configuration files + + Example usage: + + @lookup_variant({ + "default": TimeVariant.DEFAULT, + "traditional": TimeVariant.TRADITIONAL + }) + def nice_time_XX(dt, speech=True, use_24hour=False, use_ampm=False, + variant=None): + variant = variant or TimeVariant.DEFAULT + (...) 
+ + """ + if not isinstance(mappings, dict): + raise ValueError + + # Begin wrapper + def lang_variant_function_decorator(func): + + @wraps(func) + def call_function(*args, **kwargs): + if key in kwargs and isinstance(kwargs[key], str): + if kwargs[key] in mappings: + kwargs[key] = mappings[kwargs[key]] + else: + raise ValueError("Unknown variant, mapping does not " + "exist for {v}".format(v=key)) + return func(*args, **kwargs) + + return call_function + + try: + return lang_variant_function_decorator + except NotImplementedError as e: + warn(str(e)) + return diff --git a/lingua_franca/lang/__init__.py b/lingua_franca/lang/__init__.py new file mode 100644 index 0000000..a82a87d --- /dev/null +++ b/lingua_franca/lang/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from warnings import warn +from lingua_franca.internal import get_default_lang, \ + set_default_lang, get_primary_lang_code as gplc, get_full_lang_code as gflc + + +def get_active_lang(): + """ Get the active full language code (BCP-47) + + Returns: + str: A BCP-47 language code, e.g. ("en-us", or "pt-pt") + """ + _getlang = "Direct imports from lingua_franca.lang" + " have been deprecated. 
Use" + " lingua_franca.get_default_lang()" + warn(_getlang, DeprecationWarning) + return get_default_lang() + + +def set_active_lang(lang_code): + """ Set the active BCP-47 language code to be used in formatting/parsing + + Args: + lang (str): BCP-47 language code, e.g. "en-us" or "es-mx" + """ + _setlang = "Direct imports from lingua_franca.lang" + " have been deprecated. Use" + " lingua_franca.set_default_lang()" + warn(_setlang, DeprecationWarning) + set_default_lang(lang_code=lang_code) + + +def get_primary_lang_code(lang=None): + """ Get the primary language code + + Args: + lang (str, optional): A BCP-47 language code, or None for default + + Returns: + str: A primary language family, such as "en", "de" or "pt" + """ + warn("Direct imports from lingua_franca.lang have been deprecated. Use" + " lingua_franca.get_primary_lang_code()", DeprecationWarning) + return gplc(lang=lang) + + +def get_full_lang_code(lang=None): + """ Get the full language code + + Args: + lang (str, optional): A BCP-47 language code, or None for default + + Returns: + str: A full language code, such as "en-us" or "de-de" + """ + warn("Direct imports from lingua_franca.lang have been deprecated. Use" + " lingua_franca.get_full_lang_code()", DeprecationWarning) + return gflc(lang=lang) diff --git a/lingua_franca/lang/common_data_ca.py b/lingua_franca/lang/common_data_ca.py new file mode 100644 index 0000000..e9d721a --- /dev/null +++ b/lingua_franca/lang/common_data_ca.py @@ -0,0 +1,197 @@ +_FUNCTION_NOT_IMPLEMENTED_WARNING = "aquesta funció encara no s'ha implementat en 'ca'" + +# Undefined articles ["un", "una", "uns", "unes"] can not be supressed, +# in CA, "un cavall" means "a horse" or "one horse". 
+ +_ARTICLES_CA = ["el", "la", "l", "lo", "els", "les", "los"] + +# word rules for gender +_FEMALE_ENDINGS_CA = ["a", "esa", "essa", "esses", "eses", "ena", "enes", + "ques", "asi", "esi", "isi", "osi", "ut", "at", + "eta", "etes", "tja", "tges", "ica", "iques", + "ada", "ades"] +_MALE_ENDINGS_CA = ["o", "os", "ll", "lls", "ig", "igs", "itjos", "rs", + "et", "ets", "ès", "ns", "ic", "ics", "at", "ats"] + +# special cases, word lookup for words not covered by above rule +_GENDERS_CA = { + "dones": "f", + "home": "m", + "pell": "f", + "pells": "f" +} + +# context rules for gender +_MALE_DETERMINANTS_CA = ["el", "els", "l", "lo", "es", "aquest", "aquests", + "aquell", "aquells", "aqueix", "aqueixos", + "algun", "alguns", "este", "estos", "altre", + "mon", "mos", "mons", "meus", "meus"] +_FEMALE_DETERMINANTS_CA = ["la", "les", "sa", "ses", "aquesta", "aquestes", + "aquella", "aquelles", "aqueixa", "aqueixes", + "alguna", "algunes", "esta", "estes", "altra", + "ma", "mes", "meva", "meua", "meves"] + +_NUMBERS_CA = { + "zero": 0, + "u": 1, + "un": 1, + "una": 1, + "uns": 1, + "unes": 1, + "primer": 1, + "primera": 1, + "segon": 2, + "segona": 2, + "tercer": 3, + "tercera": 3, + "dos": 2, + "dues": 2, + "tres": 3, + "quatre": 4, + "cinc": 5, + "sis": 6, + "set": 7, + "vuit": 8, + "huit": 8, + "nou": 9, + "deu": 10, + "onze": 11, + "dotze": 12, + "tretze": 13, + "catorze": 14, + "quinze": 15, + "setze": 16, + "disset": 17, + "divuit": 18, + "dinou": 19, + "vint": 20, + "trenta": 30, + "quaranta": 40, + "cinquanta": 50, + "seixanta": 60, + "setanta": 70, + "vuitanta": 80, + "noranta": 90, + "cent": 100, + "cents": 100, + "dos-cents": 200, + "dues-centes": 200, + "tres-cents": 300, + "tres-centes": 300, + "quatre-cents": 400, + "quatre-centes": 400, + "cinc-cents": 500, + "cinc-centes": 500, + "sis-cents": 600, + "sis-centes": 600, + "set--cents": 700, + "set-centes": 700, + "vuit-cents": 800, + "vuit-centes": 800, + "nou-cents": 900, + "nou-centes": 900, + "mil": 1000, + 
"milió": 1000000 +} + +_FRACTION_STRING_CA = { + 2: 'mig', + 3: 'terç', + 4: 'quart', + 5: 'cinquè', + 6: 'sisè', + 7: 'setè', + 8: 'vuitè', + 9: 'novè', + 10: 'desè', + 11: 'onzè', + 12: 'dotzè', + 13: 'tretzè', + 14: 'catorzè', + 15: 'quinzè', + 16: 'setzè', + 17: 'dissetè', + 18: 'divuitè', + 19: 'dinovè', + 20: 'vintè', + 30: 'trentè', + 100: 'centè', + 1000: 'milè' +} + +_NUM_STRING_CA = { + 0: 'zero', + 1: 'un', + 2: 'dos', + 3: 'tres', + 4: 'quatre', + 5: 'cinc', + 6: 'sis', + 7: 'set', + 8: 'vuit', + 9: 'nou', + 10: 'deu', + 11: 'onze', + 12: 'dotze', + 13: 'tretze', + 14: 'catorze', + 15: 'quinze', + 16: 'setze', + 17: 'disset', + 18: 'divuit', + 19: 'dinou', + 20: 'vint', + 30: 'trenta', + 40: 'quaranta', + 50: 'cinquanta', + 60: 'seixanta', + 70: 'setanta', + 80: 'vuitanta', + 90: 'noranta' +} + +_TENS_CA = { + "vint": 20, + "trenta": 30, + "quaranta": 40, + "cinquanta": 50, + "seixanta": 60, + "setanta": 70, + "vuitanta": 80, + "huitanta": 80, + "noranta": 90 +} + +_AFTER_TENS_CA = { + "u": 1, + "un": 1, + "dos": 2, + "dues": 2, + "tres": 3, + "quatre": 4, + "cinc": 5, + "sis": 6, + "set": 7, + "vuit": 8, + "huit": 8, + "nou": 9 +} + +_BEFORE_HUNDREDS_CA = { + "dos": 2, + "dues": 2, + "tres": 3, + "quatre": 4, + "cinc": 5, + "sis": 6, + "set": 7, + "vuit": 8, + "huit": 8, + "nou": 9, +} + +_HUNDREDS_CA = { + "cent": 100, + "cents": 100, + "centes": 100 +} diff --git a/lingua_franca/lang/common_data_cs.py b/lingua_franca/lang/common_data_cs.py new file mode 100644 index 0000000..dbaf62c --- /dev/null +++ b/lingua_franca/lang/common_data_cs.py @@ -0,0 +1,305 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from collections import OrderedDict + + +#_ARTICLES_CS = {} + + +_NUM_STRING_CS = { + 0: 'nula', + 1: 'jedna', + 2: 'dva', + 3: 'tři', + 4: 'čtyři', + 5: 'pět', + 6: 'šest', + 7: 'sedm', + 8: 'osm', + 9: 'devět', + 10: 'deset', + 11: 'jedenáct', + 12: 'dvanáct', + 13: 'třináct', + 14: 'čtrnáct', + 15: 'patnáct', + 16: 'šestnáct', + 17: 'sedmnáct', + 18: 'osmnáct', + 19: 'devatenáct', + 20: 'dvacet', + 30: 'třicet', + 40: 'čtyřicet', + 50: 'padesát', + 60: 'šedesát', + 70: 'sedmdesát', + 80: 'osmdesát', + 90: 'devadesát' +} + + +_FRACTION_STRING_CS = { + 2: 'polovina', + 3: 'třetina', + 4: 'čtvrtina', + 5: 'pětina', + 6: 'šestina', + 7: 'sedmina', + 8: 'osmina', + 9: 'devítina', + 10: 'desetina', + 11: 'jedenáctina', + 12: 'dvanáctina', + 13: 'třináctina', + 14: 'čtrnáctina', + 15: 'patnáctina', + 16: 'šestnáctina', + 17: 'sedmnáctina', + 18: 'osmnáctina', + 19: 'devatenáctina', + 20: 'dvacetina', + 30: 'třicetina', + 40: 'čtyřicetina', + 50: 'padesátina', + 60: 'šedesátina', + 70: 'sedmdesátina', + 80: 'osmdesátina', + 90: 'devadesátina', + 1e2: 'setina', + 1e3: 'tisícina' +} + + +_LONG_SCALE_CS = OrderedDict([ + (100, 'sto'), + (1000, 'tisíc'), + (1000000, 'milion'), + (1e9, "miliarda"), + (1e12, "bilion"), + (1e15, "biliarda"), + (1e18, "trilion"), + (1e21, "triliarda"), + (1e24, "kvadrilion"), + (1e27, "kvadriliarda"), + (1e30, "kvintilion"), + (1e33, "kvintiliarda"), + (1e36, "sextilion"), + (1e39, "sextiliarda"), + (1e42, "septilion"), + (1e45, "septiliarda"), + (1e48, "oktilion"), + (1e51, "oktiliarda"), + (1e54, "nonilion"), + (1e57, 
"noniliarda"), + (1e60, "decilion"), + (1e63, "deciliarda"), + (1e120, "vigintilion"), + (1e180, "trigintilion"), + (1e303, "kvinkvagintiliarda"), + (1e600, "centilion"), + (1e603, "centiliarda") +]) + + +_SHORT_SCALE_CS = OrderedDict([ + (100, 'sto'), + (1000, 'tisíc'), + (1000000, 'million'), + (1e9, "billion"), + (1e12, 'trillion'), + (1e15, "quadrillion"), + (1e18, "quintillion"), + (1e21, "sextillion"), + (1e24, "septillion"), + (1e27, "octillion"), + (1e30, "nonillion"), + (1e33, "decillion"), + (1e36, "undecillion"), + (1e39, "duodecillion"), + (1e42, "tredecillion"), + (1e45, "quadrdecillion"), + (1e48, "quindecillion"), + (1e51, "sexdecillion"), + (1e54, "septendecillion"), + (1e57, "octodecillion"), + (1e60, "novemdecillion"), + (1e63, "vigintillion"), + (1e66, "unvigintillion"), + (1e69, "uuovigintillion"), + (1e72, "tresvigintillion"), + (1e75, "quattuorvigintillion"), + (1e78, "quinquavigintillion"), + (1e81, "qesvigintillion"), + (1e84, "septemvigintillion"), + (1e87, "octovigintillion"), + (1e90, "novemvigintillion"), + (1e93, "trigintillion"), + (1e96, "untrigintillion"), + (1e99, "duotrigintillion"), + (1e102, "trestrigintillion"), + (1e105, "quattuortrigintillion"), + (1e108, "quinquatrigintillion"), + (1e111, "sestrigintillion"), + (1e114, "septentrigintillion"), + (1e117, "octotrigintillion"), + (1e120, "noventrigintillion"), + (1e123, "quadragintillion"), + (1e153, "quinquagintillion"), + (1e183, "sexagintillion"), + (1e213, "septuagintillion"), + (1e243, "octogintillion"), + (1e273, "nonagintillion"), + (1e303, "centillion"), + (1e306, "uncentillion"), + (1e309, "duocentillion"), + (1e312, "trescentillion"), + (1e333, "decicentillion"), + (1e336, "undecicentillion"), + (1e363, "viginticentillion"), + (1e366, "unviginticentillion"), + (1e393, "trigintacentillion"), + (1e423, "quadragintacentillion"), + (1e453, "quinquagintacentillion"), + (1e483, "sexagintacentillion"), + (1e513, "septuagintacentillion"), + (1e543, "ctogintacentillion"), + 
(1e573, "nonagintacentillion"), + (1e603, "ducentillion"), + (1e903, "trecentillion"), + (1e1203, "quadringentillion"), + (1e1503, "quingentillion"), + (1e1803, "sescentillion"), + (1e2103, "septingentillion"), + (1e2403, "octingentillion"), + (1e2703, "nongentillion"), + (1e3003, "millinillion") +]) + + +_ORDINAL_BASE_CS = { + 1: 'první', + 2: 'druhý', + 3: 'třetí', + 4: 'čtvrtý', + 5: 'pátý', + 6: 'šestý', + 7: 'sedmý', + 8: 'osmý', + 9: 'devátý', + 10: 'desátý', + 11: 'jedenáctý', + 12: 'dvanáctý', + 13: 'třináctý', + 14: 'čtrnáctý', + 15: 'patnáctý', + 16: 'šestnáctý', + 17: 'sedmnáctý', + 18: 'osmnáctý', + 19: 'devatenáctý', + 20: 'dvacátý', + 30: 'třicátý', + 40: "čtyřicátý", + 50: "padesátý", + 60: "šedesátý", + 70: "sedmdesátý", + 80: "osmdesátý", + 90: "devadesátý", + 1e2: "stý", + 1e3: "tisící" +} + + +_SHORT_ORDINAL_CS = { + 1e6: "miliontý", + 1e9: "billiontý", + 1e12: "trilliontý", + 1e15: "quadrilliontý", + 1e18: "quintilliontý", + 1e21: "sextilliontý", + 1e24: "septilliontý", + 1e27: "oktiliontý", + 1e30: "nonilliontý", + 1e33: "decilliontý" + # TODO > 1e-33 +} +_SHORT_ORDINAL_CS.update(_ORDINAL_BASE_CS) + + +_LONG_ORDINAL_CS = { + 1e6: "miliontý", + 1e9: "miliardtý", + 1e12: "biliontý", + 1e15: "biliardtý", + 1e18: "triliontý", + 1e21: "triliardtý", + 1e24: "kvadriliontý", + 1e27: "kvadriliardtý", + 1e30: "kvintiliontý", + 1e33: "kvintiliardtý", + 1e36: "sextiliontý", + 1e39: "sextiliardtý", + 1e42: "septiliontý", + 1e45: "septiliardtý", + 1e48: "oktilion", + 1e51: "oktiliardtý", + 1e54: "noniliontý", + 1e57: "noniliardtý", + 1e60: "deciliontý" + # TODO > 1e60 +} +_LONG_ORDINAL_CS.update(_ORDINAL_BASE_CS) + +# Months + +_MONTHS_CONVERSION = { + 0: "january", + 1: "february", + 2: "march", + 3: "april", + 4: "may", + 5: "june", + 6: "july", + 7: "august", + 8: "september", + 9: "october", + 10: "november", + 11: "december" +} + +_MONTHS_CZECH = ['leden', 'únor', 'březen', 'duben', 'květen', 'červen', + 'červenec', 'srpen', 'září', 'říjen', 'listopad', 
+ 'prosinec'] + +# Time +_TIME_UNITS_CONVERSION = { + 'mikrosekund': 'microseconds', + 'milisekund': 'milliseconds', + 'sekundu': 'seconds', + 'sekundy': 'seconds', + 'sekund': 'seconds', + 'minutu': 'minutes', + 'minuty': 'minutes', + 'minut': 'minutes', + 'hodin': 'hours', + 'den': 'days', # 1 day + 'dny': 'days', # 2-4 days + 'dnů': 'days', # 5+ days + 'dní': 'days', # 5+ days - different inflection + 'dne': 'days', # a half day + 'týden': 'weeks', + 'týdny': 'weeks', + 'týdnů': 'weeks' +} diff --git a/lingua_franca/lang/common_data_da.py b/lingua_franca/lang/common_data_da.py new file mode 100644 index 0000000..0a87b52 --- /dev/null +++ b/lingua_franca/lang/common_data_da.py @@ -0,0 +1,133 @@ +_FUNCTION_NOT_IMPLEMENTED_WARNING = "Denne funktion er ikke implementeret i 'dk'." + +_DA_NUMBERS = { + 'nul': 0, + 'en': 1, + 'et': 1, + 'to': 2, + 'tre': 3, + 'fire': 4, + 'fem': 5, + 'seks': 6, + 'syv': 7, + 'otte': 8, + 'ni': 9, + 'ti': 10, + 'elve': 11, + 'tolv': 12, + 'tretten': 13, + 'fjorten': 14, + 'femten': 15, + 'seksten': 16, + 'sytten': 17, + 'atten': 18, + 'nitten': 19, + 'tyve': 20, + 'enogtyve': 21, + 'toogtyve': 22, + 'treogtyve': 23, + 'fireogtyve': 24, + 'femogtyve': 25, + 'seksogtyve': 26, + 'syvogtyve': 27, + 'otteogtyve': 28, + 'niogtyve': 29, + 'tredive': 30, + 'enogtredive': 31, + 'fyrrre': 40, + 'halvtres': 50, + 'tres': 60, + 'halvfjers': 70, + 'firs': 80, + 'halvfems': 90, + 'hunderede': 100, + 'tohundrede': 200, + 'trehundrede': 300, + 'firehundrede': 400, + 'femhundrede': 500, + 'sekshundrede': 600, + 'syvhundrede': 700, + 'ottehundrede': 800, + 'nihundrede': 900, + 'tusinde': 1000, + 'million': 1000000 +} + +_MONTHS_DA = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', + 'juli', 'august', 'september', 'oktober', 'november', + 'dezember'] + +_NUM_STRING_DA = { + 0: 'nul', + 1: 'en', + 2: 'to', + 3: 'tre', + 4: 'fire', + 5: 'fem', + 6: 'seks', + 7: 'syv', + 8: 'otte', + 9: 'ni', + 10: 'ti', + 11: 'elve', + 12: 'tolv', + 13: 'tretten', + 
14: 'fjorten', + 15: 'femten', + 16: 'seksten', + 17: 'sytten', + 18: 'atten', + 19: 'nitten', + 20: 'tyve', + 30: 'tredive', + 40: 'fyrre', + 50: 'halvtres', + 60: 'tres', + 70: 'halvfjers', + 80: 'firs', + 90: 'halvfems', + 100: 'hundrede' +} + +_NUM_POWERS_OF_TEN = [ + 'hundred', + 'tusind', + 'million', + 'milliard', + 'billion', + 'billiard', + 'trillion', + 'trilliard' +] + +_FRACTION_STRING_DA = { + 2: 'halv', + 3: 'trediedel', + 4: 'fjerdedel', + 5: 'femtedel', + 6: 'sjettedel', + 7: 'syvendedel', + 8: 'ottendedel', + 9: 'niendedel', + 10: 'tiendedel', + 11: 'elftedel', + 12: 'tolvtedel', + 13: 'trettendedel', + 14: 'fjortendedel', + 15: 'femtendedel', + 16: 'sejstendedel', + 17: 'syttendedel', + 18: 'attendedel', + 19: 'nittendedel', + 20: 'tyvendedel' +} + +# Numbers below 1 million are written in one word in Danish, yielding very +# long words +# In some circumstances it may better to seperate individual words +# Set _EXTRA_SPACE_DA=" " for separating numbers below 1 million ( +# orthographically incorrect) +# Set _EXTRA_SPACE_DA="" for correct spelling, this is standard + +# _EXTRA_SPACE_DA = " " +_EXTRA_SPACE_DA = "" diff --git a/lingua_franca/lang/common_data_de.py b/lingua_franca/lang/common_data_de.py new file mode 100644 index 0000000..abb9bec --- /dev/null +++ b/lingua_franca/lang/common_data_de.py @@ -0,0 +1,135 @@ +_DE_NUMBERS = { + 'null': 0, + 'ein': 1, + 'eins': 1, + 'eine': 1, + 'einer': 1, + 'einem': 1, + 'einen': 1, + 'eines': 1, + 'zwei': 2, + 'drei': 3, + 'vier': 4, + 'fünf': 5, + 'sechs': 6, + 'sieben': 7, + 'acht': 8, + 'neun': 9, + 'zehn': 10, + 'elf': 11, + 'zwölf': 12, + 'dreizehn': 13, + 'vierzehn': 14, + 'fünfzehn': 15, + 'sechzehn': 16, + 'siebzehn': 17, + 'achtzehn': 18, + 'neunzehn': 19, + 'zwanzig': 20, + 'einundzwanzig': 21, + 'zweiundzwanzig': 22, + 'dreiundzwanzig': 23, + 'vierundzwanzig': 24, + 'fünfundzwanzig': 25, + 'sechsundzwanzig': 26, + 'siebenundzwanzig': 27, + 'achtundzwanzig': 28, + 'neunundzwanzig': 29, + 
'dreißig': 30, + 'einunddreißig': 31, + 'vierzig': 40, + 'fünfzig': 50, + 'sechzig': 60, + 'siebzig': 70, + 'achtzig': 80, + 'neunzig': 90, + 'hundert': 100, + 'zweihundert': 200, + 'dreihundert': 300, + 'vierhundert': 400, + 'fünfhundert': 500, + 'sechshundert': 600, + 'siebenhundert': 700, + 'achthundert': 800, + 'neunhundert': 900, + 'tausend': 1000, + 'million': 1000000 +} + +_MONTHS_DE = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', + 'juli', 'august', 'september', 'oktober', 'november', + 'dezember'] + +_NUM_STRING_DE = { + 0: 'null', + 1: 'ein', # ein Viertel etc., nicht eins Viertel + 2: 'zwei', + 3: 'drei', + 4: 'vier', + 5: 'fünf', + 6: 'sechs', + 7: 'sieben', + 8: 'acht', + 9: 'neun', + 10: 'zehn', + 11: 'elf', + 12: 'zwölf', + 13: 'dreizehn', + 14: 'vierzehn', + 15: 'fünfzehn', + 16: 'sechzehn', + 17: 'siebzehn', + 18: 'achtzehn', + 19: 'neunzehn', + 20: 'zwanzig', + 30: 'dreißig', + 40: 'vierzig', + 50: 'fünfzig', + 60: 'sechzig', + 70: 'siebzig', + 80: 'achtzig', + 90: 'neunzig', + 100: 'hundert' +} + +# German uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales +# Currently, numbers are limited to 1000000000000000000000000, +# but _NUM_POWERS_OF_TEN can be extended to include additional number words + + +_NUM_POWERS_OF_TEN_DE = [ + '', 'tausend', 'Million', 'Milliarde', 'Billion', 'Billiarde', 'Trillion', + 'Trilliarde' +] + +_FRACTION_STRING_DE = { + 2: 'halb', + 3: 'drittel', + 4: 'viertel', + 5: 'fünftel', + 6: 'sechstel', + 7: 'siebtel', + 8: 'achtel', + 9: 'neuntel', + 10: 'zehntel', + 11: 'elftel', + 12: 'zwölftel', + 13: 'dreizehntel', + 14: 'vierzehntel', + 15: 'fünfzehntel', + 16: 'sechzehntel', + 17: 'siebzehntel', + 18: 'achtzehntel', + 19: 'neunzehntel', + 20: 'zwanzigstel' +} + +# Numbers below 1 million are written in one word in German, yielding very +# long words +# In some circumstances it may better to seperate individual words +# Set _EXTRA_SPACE_DA=" " for separating numbers below 1 million ( +# 
orthographically incorrect) +# Set _EXTRA_SPACE_DA="" for correct spelling, this is standard + +# _EXTRA_SPACE_DA = " " +_EXTRA_SPACE_DE = "" diff --git a/lingua_franca/lang/common_data_en.py b/lingua_franca/lang/common_data_en.py new file mode 100644 index 0000000..f2f8de1 --- /dev/null +++ b/lingua_franca/lang/common_data_en.py @@ -0,0 +1,297 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from collections import OrderedDict +from .parse_common import invert_dict + +_FUNCTION_NOT_IMPLEMENTED_WARNING = "The requested function is not implemented in English." 
+ +_ARTICLES_EN = {'a', 'an', 'the'} + + +_NUM_STRING_EN = { + 0: 'zero', + 1: 'one', + 2: 'two', + 3: 'three', + 4: 'four', + 5: 'five', + 6: 'six', + 7: 'seven', + 8: 'eight', + 9: 'nine', + 10: 'ten', + 11: 'eleven', + 12: 'twelve', + 13: 'thirteen', + 14: 'fourteen', + 15: 'fifteen', + 16: 'sixteen', + 17: 'seventeen', + 18: 'eighteen', + 19: 'nineteen', + 20: 'twenty', + 30: 'thirty', + 40: 'forty', + 50: 'fifty', + 60: 'sixty', + 70: 'seventy', + 80: 'eighty', + 90: 'ninety' +} + + +_FRACTION_STRING_EN = { + 2: 'half', + 3: 'third', + 4: 'forth', + 5: 'fifth', + 6: 'sixth', + 7: 'seventh', + 8: 'eigth', + 9: 'ninth', + 10: 'tenth', + 11: 'eleventh', + 12: 'twelveth', + 13: 'thirteenth', + 14: 'fourteenth', + 15: 'fifteenth', + 16: 'sixteenth', + 17: 'seventeenth', + 18: 'eighteenth', + 19: 'nineteenth', + 20: 'twentyith' +} + + +_LONG_SCALE_EN = OrderedDict([ + (100, 'hundred'), + (1000, 'thousand'), + (1000000, 'million'), + (1e12, "billion"), + (1e18, 'trillion'), + (1e24, "quadrillion"), + (1e30, "quintillion"), + (1e36, "sextillion"), + (1e42, "septillion"), + (1e48, "octillion"), + (1e54, "nonillion"), + (1e60, "decillion"), + (1e66, "undecillion"), + (1e72, "duodecillion"), + (1e78, "tredecillion"), + (1e84, "quattuordecillion"), + (1e90, "quinquadecillion"), + (1e96, "sedecillion"), + (1e102, "septendecillion"), + (1e108, "octodecillion"), + (1e114, "novendecillion"), + (1e120, "vigintillion"), + (1e306, "unquinquagintillion"), + (1e312, "duoquinquagintillion"), + (1e336, "sesquinquagintillion"), + (1e366, "unsexagintillion") +]) + + +_SHORT_SCALE_EN = OrderedDict([ + (100, 'hundred'), + (1000, 'thousand'), + (1000000, 'million'), + (1e9, "billion"), + (1e12, 'trillion'), + (1e15, "quadrillion"), + (1e18, "quintillion"), + (1e21, "sextillion"), + (1e24, "septillion"), + (1e27, "octillion"), + (1e30, "nonillion"), + (1e33, "decillion"), + (1e36, "undecillion"), + (1e39, "duodecillion"), + (1e42, "tredecillion"), + (1e45, "quattuordecillion"), + (1e48, 
"quinquadecillion"), + (1e51, "sedecillion"), + (1e54, "septendecillion"), + (1e57, "octodecillion"), + (1e60, "novendecillion"), + (1e63, "vigintillion"), + (1e66, "unvigintillion"), + (1e69, "uuovigintillion"), + (1e72, "tresvigintillion"), + (1e75, "quattuorvigintillion"), + (1e78, "quinquavigintillion"), + (1e81, "qesvigintillion"), + (1e84, "septemvigintillion"), + (1e87, "octovigintillion"), + (1e90, "novemvigintillion"), + (1e93, "trigintillion"), + (1e96, "untrigintillion"), + (1e99, "duotrigintillion"), + (1e102, "trestrigintillion"), + (1e105, "quattuortrigintillion"), + (1e108, "quinquatrigintillion"), + (1e111, "sestrigintillion"), + (1e114, "septentrigintillion"), + (1e117, "octotrigintillion"), + (1e120, "noventrigintillion"), + (1e123, "quadragintillion"), + (1e153, "quinquagintillion"), + (1e183, "sexagintillion"), + (1e213, "septuagintillion"), + (1e243, "octogintillion"), + (1e273, "nonagintillion"), + (1e303, "centillion"), + (1e306, "uncentillion"), + (1e309, "duocentillion"), + (1e312, "trescentillion"), + (1e333, "decicentillion"), + (1e336, "undecicentillion"), + (1e363, "viginticentillion"), + (1e366, "unviginticentillion"), + (1e393, "trigintacentillion"), + (1e423, "quadragintacentillion"), + (1e453, "quinquagintacentillion"), + (1e483, "sexagintacentillion"), + (1e513, "septuagintacentillion"), + (1e543, "ctogintacentillion"), + (1e573, "nonagintacentillion"), + (1e603, "ducentillion"), + (1e903, "trecentillion"), + (1e1203, "quadringentillion"), + (1e1503, "quingentillion"), + (1e1803, "sescentillion"), + (1e2103, "septingentillion"), + (1e2403, "octingentillion"), + (1e2703, "nongentillion"), + (1e3003, "millinillion") +]) + + +_ORDINAL_BASE_EN = { + 1: 'first', + 2: 'second', + 3: 'third', + 4: 'fourth', + 5: 'fifth', + 6: 'sixth', + 7: 'seventh', + 8: 'eighth', + 9: 'ninth', + 10: 'tenth', + 11: 'eleventh', + 12: 'twelfth', + 13: 'thirteenth', + 14: 'fourteenth', + 15: 'fifteenth', + 16: 'sixteenth', + 17: 'seventeenth', + 18: 
'eighteenth', + 19: 'nineteenth', + 20: 'twentieth', + 30: 'thirtieth', + 40: "fortieth", + 50: "fiftieth", + 60: "sixtieth", + 70: "seventieth", + 80: "eightieth", + 90: "ninetieth", + 1e2: "hundredth", + 1e3: "thousandth" +} + + +_SHORT_ORDINAL_EN = { + 1e6: "millionth", + 1e9: "billionth", + 1e12: "trillionth", + 1e15: "quadrillionth", + 1e18: "quintillionth", + 1e21: "sextillionth", + 1e24: "septillionth", + 1e27: "octillionth", + 1e30: "nonillionth", + 1e33: "decillionth" + # TODO > 1e-33 +} +_SHORT_ORDINAL_EN.update(_ORDINAL_BASE_EN) + + +_LONG_ORDINAL_EN = { + 1e6: "millionth", + 1e12: "billionth", + 1e18: "trillionth", + 1e24: "quadrillionth", + 1e30: "quintillionth", + 1e36: "sextillionth", + 1e42: "septillionth", + 1e48: "octillionth", + 1e54: "nonillionth", + 1e60: "decillionth" + # TODO > 1e60 +} +_LONG_ORDINAL_EN.update(_ORDINAL_BASE_EN) + + +# negate next number (-2 = 0 - 2) +_NEGATIVES_EN = {"negative", "minus"} + +# sum the next number (twenty two = 20 + 2) +_SUMS_EN = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50', + 'sixty', '60', 'seventy', '70', 'eighty', '80', 'ninety', '90'} + + +def _generate_plurals_en(originals): + """ + Return a new set or dict containing the plural form of the original values, + + In English this means all with 's' appended to them. 
+ + Args: + originals set(str) or dict(str, any): values to pluralize + + Returns: + set(str) or dict(str, any) + + """ + # TODO migrate to https://github.com/MycroftAI/lingua-franca/pull/36 + if isinstance(originals, dict): + return {key + 's': value for key, value in originals.items()} + return {value + "s" for value in originals} + + +_MULTIPLIES_LONG_SCALE_EN = set(_LONG_SCALE_EN.values()) | \ + _generate_plurals_en(_LONG_SCALE_EN.values()) + +_MULTIPLIES_SHORT_SCALE_EN = set(_SHORT_SCALE_EN.values()) | \ + _generate_plurals_en(_SHORT_SCALE_EN.values()) + +# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) +_FRACTION_MARKER_EN = {"and"} + +# decimal marker ( 1 point 5 = 1 + 0.5) +_DECIMAL_MARKER_EN = {"point", "dot"} + +_STRING_NUM_EN = invert_dict(_NUM_STRING_EN) +_STRING_NUM_EN.update(_generate_plurals_en(_STRING_NUM_EN)) + +_SPOKEN_EXTRA_NUM_EN = { + "half": 0.5, + "halves": 0.5, + "couple": 2 + } +_STRING_SHORT_ORDINAL_EN = invert_dict(_SHORT_ORDINAL_EN) +_STRING_LONG_ORDINAL_EN = invert_dict(_LONG_ORDINAL_EN) diff --git a/lingua_franca/lang/common_data_es.py b/lingua_franca/lang/common_data_es.py new file mode 100644 index 0000000..f522d13 --- /dev/null +++ b/lingua_franca/lang/common_data_es.py @@ -0,0 +1,313 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# NOTE: This file as no use yet. 
It needs to be called from other functions + +from collections import OrderedDict + + +_ARTICLES_ES = {'el', 'la', 'los', 'las'} + +_NUM_STRING_ES = { + 0: 'cero', + 1: 'uno', + 2: 'dos', + 3: 'tres', + 4: 'cuatro', + 5: 'cinco', + 6: 'seis', + 7: 'siete', + 8: 'ocho', + 9: 'nueve', + 10: 'diez', + 11: 'once', + 12: 'doce', + 13: 'trece', + 14: 'catorce', + 15: 'quince', + 16: 'dieciséis', + 17: 'diecisiete', + 18: 'dieciocho', + 19: 'diecinueve', + 20: 'veinte', + 30: 'treinta', + 40: 'cuarenta', + 50: 'cincuenta', + 60: 'sesenta', + 70: 'setenta', + 80: 'ochenta', + 90: 'noventa' +} + +_STRING_NUM_ES = { + "cero": 0, + "un": 1, + "uno": 1, + "una": 1, + "dos": 2, + "tres": 3, + "trés": 3, + "cuatro": 4, + "cinco": 5, + "seis": 6, + "siete": 7, + "ocho": 8, + "nueve": 9, + "diez": 10, + "once": 11, + "doce": 12, + "trece": 13, + "catorce": 14, + "quince": 15, + "dieciseis": 16, + "dieciséis": 16, + "diecisiete": 17, + "dieciocho": 18, + "diecinueve": 19, + "veinte": 20, + "veintiuno": 21, + "veintidós": 22, + "veintitrés": 23, + "veintidos": 22, + "veintitres": 23, + "veintitrés": 23, + "veinticuatro": 24, + "veinticinco": 25, + "veintiséis": 26, + "veintiseis": 26, + "veintisiete": 27, + "veintiocho": 28, + "veintinueve": 29, + "treinta": 30, + "cuarenta": 40, + "cincuenta": 50, + "sesenta": 60, + "setenta": 70, + "ochenta": 80, + "noventa": 90, + "cien": 100, + "ciento": 100, + "doscientos": 200, + "doscientas": 200, + "trescientos": 300, + "trescientas": 300, + "cuatrocientos": 400, + "cuatrocientas": 400, + "quinientos": 500, + "quinientas": 500, + "seiscientos": 600, + "seiscientas": 600, + "setecientos": 700, + "setecientas": 700, + "ochocientos": 800, + "ochocientas": 800, + "novecientos": 900, + "novecientas": 900, + "mil": 1000} + + +_FRACTION_STRING_ES = { + 2: 'medio', + 3: 'tercio', + 4: 'cuarto', + 5: 'quinto', + 6: 'sexto', + 7: 'séptimo', + 8: 'octavo', + 9: 'noveno', + 10: 'décimo', + 11: 'onceavo', + 12: 'doceavo', + 13: 'treceavo', + 14: 
'catorceavo', + 15: 'quinceavo', + 16: 'dieciseisavo', + 17: 'diecisieteavo', + 18: 'dieciochoavo', + 19: 'diecinueveavo', + 20: 'veinteavo' +} + +# https://www.grobauer.at/es_eur/zahlnamen.php +_LONG_SCALE_ES = OrderedDict([ + (100, 'centena'), + (1000, 'millar'), + (1000000, 'millón'), + (1e9, "millardo"), + (1e12, "billón"), + (1e18, 'trillón'), + (1e24, "cuatrillón"), + (1e30, "quintillón"), + (1e36, "sextillón"), + (1e42, "septillón"), + (1e48, "octillón"), + (1e54, "nonillón"), + (1e60, "decillón"), + (1e66, "undecillón"), + (1e72, "duodecillón"), + (1e78, "tredecillón"), + (1e84, "cuatrodecillón"), + (1e90, "quindecillón"), + (1e96, "sexdecillón"), + (1e102, "septendecillón"), + (1e108, "octodecillón"), + (1e114, "novendecillón"), + (1e120, "vigintillón"), + (1e306, "unquinquagintillón"), + (1e312, "duoquinquagintillón"), + (1e336, "sexquinquagintillón"), + (1e366, "unsexagintillón") +]) + + +_SHORT_SCALE_ES = OrderedDict([ + (100, 'centena'), + (1000, 'millar'), + (1000000, 'millón'), + (1e9, "billón"), + (1e12, 'trillón'), + (1e15, "cuatrillón"), + (1e18, "quintillón"), + (1e21, "sextillón"), + (1e24, "septillón"), + (1e27, "octillón"), + (1e30, "nonillón"), + (1e33, "decillón"), + (1e36, "undecillón"), + (1e39, "duodecillón"), + (1e42, "tredecillón"), + (1e45, "cuatrodecillón"), + (1e48, "quindecillón"), + (1e51, "sexdecillón"), + (1e54, "septendecillón"), + (1e57, "octodecillón"), + (1e60, "novendecillón"), + (1e63, "vigintillón"), + (1e66, "unvigintillón"), + (1e69, "duovigintillón"), + (1e72, "tresvigintillón"), + (1e75, "quattuorvigintillón"), + (1e78, "quinquavigintillón"), + (1e81, "sesvigintillón"), + (1e84, "septemvigintillón"), + (1e87, "octovigintillón"), + (1e90, "novemvigintillón"), + (1e93, "trigintillón"), + (1e96, "untrigintillón"), + (1e99, "duotrigintillón"), + (1e102, "trestrigintillón"), + (1e105, "quattuortrigintillón"), + (1e108, "quinquatrigintillón"), + (1e111, "sestrigintillón"), + (1e114, "septentrigintillón"), + (1e117, 
"octotrigintillón"), + (1e120, "noventrigintillón"), + (1e123, "quadragintillón"), + (1e153, "quinquagintillón"), + (1e183, "sexagintillón"), + (1e213, "septuagintillón"), + (1e243, "octogintillón"), + (1e273, "nonagintillón"), + (1e303, "centillón"), + (1e306, "uncentillón"), + (1e309, "duocentillón"), + (1e312, "trescentillón"), + (1e333, "decicentillón"), + (1e336, "undecicentillón"), + (1e363, "viginticentillón"), + (1e366, "unviginticentillón"), + (1e393, "trigintacentillón"), + (1e423, "quadragintacentillón"), + (1e453, "quinquagintacentillón"), + (1e483, "sexagintacentillón"), + (1e513, "septuagintacentillón"), + (1e543, "octogintacentillón"), + (1e573, "nonagintacentillón"), + (1e603, "ducentillón"), + (1e903, "trecentillón"), + (1e1203, "quadringentillón"), + (1e1503, "quingentillón"), + (1e1803, "sexcentillón"), + (1e2103, "septingentillón"), + (1e2403, "octingentillón"), + (1e2703, "nongentillón"), + (1e3003, "millinillón") +]) + +# TODO: female forms. +_ORDINAL_STRING_BASE_ES = { + 1: 'primero', + 2: 'segundo', + 3: 'tercero', + 4: 'cuarto', + 5: 'quinto', + 6: 'sexto', + 7: 'séptimo', + 8: 'octavo', + 9: 'noveno', + 10: 'décimo', + 11: 'undécimo', + 12: 'duodécimo', + 13: 'decimotercero', + 14: 'decimocuarto', + 15: 'decimoquinto', + 16: 'decimosexto', + 17: 'decimoséptimo', + 18: 'decimoctavo', + 19: 'decimonoveno', + 20: 'vigésimo', + 30: 'trigésimo', + 40: "cuadragésimo", + 50: "quincuagésimo", + 60: "sexagésimo", + 70: "septuagésimo", + 80: "octogésimo", + 90: "nonagésimo", + 1e2: "centésimo", + 1e3: "milésimo" +} + + +_SHORT_ORDINAL_STRING_ES = { + 1e6: "millonésimo", + 1e9: "milmillonésimo", + 1e12: "billonésimo", + 1e15: "milbillonésimo", + 1e18: "trillonésimo", + 1e21: "miltrillonésimo", + 1e24: "cuatrillonésimo", + 1e27: "milcuatrillonésimo", + 1e30: "quintillonésimo", + 1e33: "milquintillonésimo" + # TODO > 1e33 +} +_SHORT_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES) + + +_LONG_ORDINAL_STRING_ES = { + 1e6: "millonésimo", + 1e12: 
"billonésimo", + 1e18: "trillonésimo", + 1e24: "cuatrillonésimo", + 1e30: "quintillonésimo", + 1e36: "sextillonésimo", + 1e42: "septillonésimo", + 1e48: "octillonésimo", + 1e54: "nonillonésimo", + 1e60: "decillonésimo" + # TODO > 1e60 +} +_LONG_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES) diff --git a/lingua_franca/lang/common_data_fa.py b/lingua_franca/lang/common_data_fa.py new file mode 100644 index 0000000..f44a219 --- /dev/null +++ b/lingua_franca/lang/common_data_fa.py @@ -0,0 +1,115 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from collections import OrderedDict +from .parse_common import invert_dict + +_FUNCTION_NOT_IMPLEMENTED_WARNING = "تابع خواسته شده در زبان فارسی پیاده سازی نشده است." 
+ + +_FRACTION_STRING_FA = { + 2: 'دوم', + 3: 'سوم', + 4: 'چهارم', + 5: 'پنجم', + 6: 'ششم', + 7: 'هفتم', + 8: 'هشتم', + 9: 'نهم', + 10: 'دهم', + 11: 'یازدهم', + 12: 'دوازدهم', + 13: 'سیزدهم', + 14: 'چهاردهم', + 15: 'پونزدهم', + 16: 'شونزدهم', + 17: 'هیفدهم', + 18: 'هیجدهم', + 19: 'نوزدهم', + 20: 'بیستم' +} + + +_FARSI_ONES = [ + "", + "یک", + "دو", + "سه", + "چهار", + "پنج", + "شش", + "هفت", + "هشت", + "نه", + "ده", + "یازده", + "دوازده", + "سیزده", + "چهارده", + "پونزده", + "شونزده", + "هیفده", + "هیجده", + "نوزده", +] + +_FARSI_TENS = [ + "", + "ده", + "بیست", + "سی", + "چهل", + "پنجاه", + "شصت", + "هفتاد", + "هشتاد", + "نود", +] + +_FARSI_HUNDREDS = [ + "", + "صد", + "دویست", + "سیصد", + "چهارصد", + "پانصد", + "ششصد", + "هفتصد", + "هشتصد", + "نهصد", +] + +_FARSI_BIG = [ + '', + 'هزار', + 'میلیون', + "میلیارد", + 'تریلیون', + "تریلیارد", +] + + +_FORMAL_VARIANT = { + 'هفده': 'هیفده', + 'هجده': 'هیجده', + 'شانزده': 'شونزده', + 'پانزده': 'پونزده', +} + + +_FARSI_FRAC = ["", "ده", "صد"] +_FARSI_FRAC_BIG = ["", "هزار", "میلیونی", "میلیاردی"] + +_FARSI_SEPERATOR = ' و ' \ No newline at end of file diff --git a/lingua_franca/lang/common_data_fr.py b/lingua_franca/lang/common_data_fr.py new file mode 100644 index 0000000..e7abcc6 --- /dev/null +++ b/lingua_franca/lang/common_data_fr.py @@ -0,0 +1,98 @@ +# Undefined articles ["un", "une"] cannot be supressed, +# in French, "un cheval" means "a horse" or "one horse". 
+_ARTICLES_FR = ["le", "la", "du", "de", "les", "des"] + +_NUMBERS_FR = { + "zéro": 0, + "un": 1, + "une": 1, + "deux": 2, + "trois": 3, + "quatre": 4, + "cinq": 5, + "six": 6, + "sept": 7, + "huit": 8, + "neuf": 9, + "dix": 10, + "onze": 11, + "douze": 12, + "treize": 13, + "quatorze": 14, + "quinze": 15, + "seize": 16, + "vingt": 20, + "trente": 30, + "quarante": 40, + "cinquante": 50, + "soixante": 60, + "soixante-dix": 70, + "septante": 70, + "quatre-vingt": 80, + "quatre-vingts": 80, + "octante": 80, + "huitante": 80, + "quatre-vingt-dix": 90, + "nonante": 90, + "cent": 100, + "cents": 100, + "mille": 1000, + "mil": 1000, + "millier": 1000, + "milliers": 1000, + "million": 1000000, + "millions": 1000000, + "milliard": 1000000000, + "milliards": 1000000000} + +_ORDINAL_ENDINGS_FR = ("er", "re", "ère", "nd", "nde", "ième", "ème", "e") + +_NUM_STRING_FR = { + 0: 'zéro', + 1: 'un', + 2: 'deux', + 3: 'trois', + 4: 'quatre', + 5: 'cinq', + 6: 'six', + 7: 'sept', + 8: 'huit', + 9: 'neuf', + 10: 'dix', + 11: 'onze', + 12: 'douze', + 13: 'treize', + 14: 'quatorze', + 15: 'quinze', + 16: 'seize', + 20: 'vingt', + 30: 'trente', + 40: 'quarante', + 50: 'cinquante', + 60: 'soixante', + 70: 'soixante-dix', + 80: 'quatre-vingt', + 90: 'quatre-vingt-dix' +} + +_FRACTION_STRING_FR = { + 2: 'demi', + 3: 'tiers', + 4: 'quart', + 5: 'cinquième', + 6: 'sixième', + 7: 'septième', + 8: 'huitième', + 9: 'neuvième', + 10: 'dixième', + 11: 'onzième', + 12: 'douzième', + 13: 'treizième', + 14: 'quatorzième', + 15: 'quinzième', + 16: 'seizième', + 17: 'dix-septième', + 18: 'dix-huitième', + 19: 'dix-neuvième', + 20: 'vingtième' +} diff --git a/lingua_franca/lang/common_data_hu.py b/lingua_franca/lang/common_data_hu.py new file mode 100644 index 0000000..7f6f370 --- /dev/null +++ b/lingua_franca/lang/common_data_hu.py @@ -0,0 +1,77 @@ +_MONTHS_HU = ['január', 'február', 'március', 'április', 'május', 'június', + 'július', 'augusztus', 'szeptember', 'október', 'november', + 'december'] + 
+_NUM_STRING_HU = { + 0: 'nulla', + 1: 'egy', + 2: 'kettő', + 3: 'három', + 4: 'négy', + 5: 'öt', + 6: 'hat', + 7: 'hét', + 8: 'nyolc', + 9: 'kilenc', + 10: 'tíz', + 11: 'tizenegy', + 12: 'tizenkettő', + 13: 'tizenhárom', + 14: 'tizennégy', + 15: 'tizenöt', + 16: 'tizenhat', + 17: 'tizenhét', + 18: 'tizennyolc', + 19: 'tizenkilenc', + 20: 'húsz', + 30: 'harminc', + 40: 'negyven', + 50: 'ötven', + 60: 'hatvan', + 70: 'hetven', + 80: 'nyolcvan', + 90: 'kilencven', + 100: 'száz' +} + +# Hungarian uses "long scale" +# https://en.wikipedia.org/wiki/Long_and_short_scales +# Currently, numbers are limited to 1000000000000000000000000, +# but _NUM_POWERS_OF_TEN can be extended to include additional number words + +_NUM_POWERS_OF_TEN = [ + '', 'ezer', 'millió', 'milliárd', 'billió', 'billiárd', 'trillió', + 'trilliárd' +] + +_FRACTION_STRING_HU = { + 2: 'fél', + 3: 'harmad', + 4: 'negyed', + 5: 'ötöd', + 6: 'hatod', + 7: 'heted', + 8: 'nyolcad', + 9: 'kilenced', + 10: 'tized', + 11: 'tizenegyed', + 12: 'tizenketted', + 13: 'tizenharmad', + 14: 'tizennegyed', + 15: 'tizenötöd', + 16: 'tizenhatod', + 17: 'tizenheted', + 18: 'tizennyolcad', + 19: 'tizenkilenced', + 20: 'huszad' +} + +# Numbers below 2 thousand are written in one word in Hungarian +# Numbers above 2 thousand are separated by hyphens +# In some circumstances it may better to seperate individual words +# Set _EXTRA_SPACE_HU=" " for separating numbers below 2 thousand ( +# orthographically incorrect) +# Set _EXTRA_SPACE_HU="" for correct spelling, this is standard + +# _EXTRA_SPACE_HU = " " +_EXTRA_SPACE_HU = "" diff --git a/lingua_franca/lang/common_data_it.py b/lingua_franca/lang/common_data_it.py new file mode 100644 index 0000000..eed0edc --- /dev/null +++ b/lingua_franca/lang/common_data_it.py @@ -0,0 +1,321 @@ +import collections + + +_SHORT_ORDINAL_STRING_IT = { + 1: 'primo', + 2: 'secondo', + 3: 'terzo', + 4: 'quarto', + 5: 'quinto', + 6: 'sesto', + 7: 'settimo', + 8: 'ottavo', + 9: 'nono', + 10: 'decimo', 
+ 11: 'undicesimo', + 12: 'dodicesimo', + 13: 'tredicesimo', + 14: 'quattordicesimo', + 15: 'quindicesimo', + 16: 'sedicesimo', + 17: 'diciassettesimo', + 18: 'diciottesimo', + 19: 'diciannovesimo', + 20: 'ventesimo', + 30: 'trentesimo', + 40: 'quarantesimo', + 50: 'cinquantesimo', + 60: 'sessantesimo', + 70: 'settantesimo', + 80: 'ottantesimo', + 90: 'novantesimo', + 1e2: 'centesimo', + 1e3: 'millesimo', + 1e6: 'milionesimo', + 1e9: 'miliardesimo', + 1e12: 'trilionesimo', + 1e15: 'quadrilionesimo', + 1e18: 'quintilionesimo', + 1e21: 'sestilionesimo', + 1e24: 'settilionesimo', + 1e27: 'ottilionesimo', + 1e30: 'nonilionesimo', + 1e33: 'decilionesimo' + # TODO > 1e33 +} + +# per i > 10e12 modificata solo la desinenza: da sistemare a fine debug +_LONG_ORDINAL_STRING_IT = { + 1: 'primo', + 2: 'secondo', + 3: 'terzo', + 4: 'quarto', + 5: 'quinto', + 6: 'sesto', + 7: 'settimo', + 8: 'ottavo', + 9: 'nono', + 10: 'decimo', + 11: 'undicesimo', + 12: 'dodicesimo', + 13: 'tredicesimo', + 14: 'quattordicesimo', + 15: 'quindicesimo', + 16: 'sedicesimo', + 17: 'diciassettesimo', + 18: 'diciottesimo', + 19: 'diciannovesimo', + 20: 'ventesimo', + 30: 'trentesimo', + 40: 'quarantesimo', + 50: 'cinquantesimo', + 60: 'sessantesimo', + 70: 'settantesimo', + 80: 'ottantesimo', + 90: 'novantesimo', + 1e2: 'centesimo', + 1e3: 'millesimo', + 1e6: 'milionesimo', + 1e12: 'bilionesimo', + 1e18: 'trilionesimo', + 1e24: 'quadrilionesimo', + 1e30: 'quintilionesimo', + 1e36: 'sestilionesimo', + 1e42: 'settilionesimo', + 1e48: 'ottilionesimo', + 1e54: 'nonilionesimo', + 1e60: 'decilionesimo' + # TODO > 1e60 +} + +# Indefinite articles ['un', 'una', 'un\''] can not be suppressed, +# in Italian, 'un cavallo' means 'a horse' or 'one horse'. 
+_ARTICLES_IT = ['il', 'lo', 'la', 'i', 'gli', 'le'] + +_STRING_NUM_IT = { + 'zero': 0, + 'un': 1, + 'uno': 1, + 'una': 1, + 'un\'': 1, + 'due': 2, + 'tre': 3, + 'quattro': 4, + 'cinque': 5, + 'sei': 6, + 'sette': 7, + 'otto': 8, + 'nove': 9, + 'dieci': 10, + 'undici': 11, + 'dodici': 12, + 'tredici': 13, + 'quattordici': 14, + 'quindici': 15, + 'sedici': 16, + 'diciassette': 17, + 'diciotto': 18, + 'diciannove': 19, + 'venti': 20, + 'vent': 20, + 'trenta': 30, + 'trent': 30, + 'quaranta': 40, + 'quarant': 40, + 'cinquanta': 50, + 'cinquant': 50, + 'sessanta': 60, + 'sessant': 60, + 'settanta': 70, + 'settant': 70, + 'ottanta': 80, + 'ottant': 80, + 'novanta': 90, + 'novant': 90, + 'cento': 100, + 'duecento': 200, + 'trecento': 300, + 'quattrocento': 400, + 'cinquecento': 500, + 'seicento': 600, + 'settecento': 700, + 'ottocento': 800, + 'novecento': 900, + 'mille': 1000, + 'mila': 1000, + 'centomila': 100000, + 'milione': 1000000, + 'miliardo': 1000000000, + 'primo': 1, + 'secondo': 2, + 'mezzo': 0.5, + 'mezza': 0.5, + 'paio': 2, + 'decina': 10, + 'decine': 10, + 'dozzina': 12, + 'dozzine': 12, + 'centinaio': 100, + 'centinaia': 100, + 'migliaio': 1000, + 'migliaia': 1000 +} + +_NUM_STRING_IT = { + 0: 'zero', + 1: 'uno', + 2: 'due', + 3: 'tre', + 4: 'quattro', + 5: 'cinque', + 6: 'sei', + 7: 'sette', + 8: 'otto', + 9: 'nove', + 10: 'dieci', + 11: 'undici', + 12: 'dodici', + 13: 'tredici', + 14: 'quattordici', + 15: 'quindici', + 16: 'sedici', + 17: 'diciassette', + 18: 'diciotto', + 19: 'diciannove', + 20: 'venti', + 30: 'trenta', + 40: 'quaranta', + 50: 'cinquanta', + 60: 'sessanta', + 70: 'settanta', + 80: 'ottanta', + 90: 'novanta' +} + +_FRACTION_STRING_IT = { + 2: 'mezz', + 3: 'terz', + 4: 'quart', + 5: 'quint', + 6: 'sest', + 7: 'settim', + 8: 'ottav', + 9: 'non', + 10: 'decim', + 11: 'undicesim', + 12: 'dodicesim', + 13: 'tredicesim', + 14: 'quattordicesim', + 15: 'quindicesim', + 16: 'sedicesim', + 17: 'diciassettesim', + 18: 'diciottesim', + 19: 
'diciannovesim', + 20: 'ventesim' +} + +# fonte: http://tulengua.es/numeros-texto/default.aspx +_LONG_SCALE_IT = collections.OrderedDict([ + (100, 'cento'), + (1000, 'mila'), + (1000000, 'milioni'), + (1e9, "miliardi"), + (1e12, "bilioni"), + (1e18, 'trilioni'), + (1e24, "quadrilioni"), + (1e30, "quintilioni"), + (1e36, "sestilioni"), + (1e42, "settilioni"), + (1e48, "ottillioni"), + (1e54, "nonillioni"), + (1e60, "decemillioni"), + (1e66, "undicilione"), + (1e72, "dodicilione"), + (1e78, "tredicilione"), + (1e84, "quattordicilione"), + (1e90, "quindicilione"), + (1e96, "sedicilione"), + (1e102, "diciasettilione"), + (1e108, "diciottilione"), + (1e114, "dicianovilione"), + (1e120, "vintilione"), + (1e306, "unquinquagintilione"), + (1e312, "duoquinquagintilione"), + (1e336, "sesquinquagintilione"), + (1e366, "unsexagintilione") +]) + + +_SHORT_SCALE_IT = collections.OrderedDict([ + (100, 'cento'), + (1000, 'mila'), + (1000000, 'milioni'), + (1e9, "miliardi"), + (1e12, 'bilioni'), + (1e15, "biliardi"), + (1e18, "trilioni"), + (1e21, "triliardi"), + (1e24, "quadrilioni"), + (1e27, "quadriliardi"), + (1e30, "quintilioni"), + (1e33, "quintiliardi"), + (1e36, "sestilioni"), + (1e39, "sestiliardi"), + (1e42, "settilioni"), + (1e45, "settiliardi"), + (1e48, "ottilioni"), + (1e51, "ottiliardi"), + (1e54, "nonilioni"), + (1e57, "noniliardi"), + (1e60, "decilioni"), + (1e63, "deciliardi"), + (1e66, "undicilioni"), + (1e69, "undiciliardi"), + (1e72, "dodicilioni"), + (1e75, "dodiciliardi"), + (1e78, "tredicilioni"), + (1e81, "trediciliardi"), + (1e84, "quattordicilioni"), + (1e87, "quattordiciliardi"), + (1e90, "quindicilioni"), + (1e93, "quindiciliardi"), + (1e96, "sedicilioni"), + (1e99, "sediciliardi"), + (1e102, "diciassettilioni"), + (1e105, "diciassettiliardi"), + (1e108, "diciottilioni"), + (1e111, "diciottiliardi"), + (1e114, "dicianovilioni"), + (1e117, "dicianoviliardi"), + (1e120, "vintilioni"), + (1e123, "vintiliardi"), + (1e153, "quinquagintillion"), + (1e183, 
"sexagintillion"), + (1e213, "septuagintillion"), + (1e243, "ottogintilioni"), + (1e273, "nonigintillioni"), + (1e303, "centilioni"), + (1e306, "uncentilioni"), + (1e309, "duocentilioni"), + (1e312, "trecentilioni"), + (1e333, "decicentilioni"), + (1e336, "undicicentilioni"), + (1e363, "viginticentilioni"), + (1e366, "unviginticentilioni"), + (1e393, "trigintacentilioni"), + (1e423, "quadragintacentillion"), + (1e453, "quinquagintacentillion"), + (1e483, "sexagintacentillion"), + (1e513, "septuagintacentillion"), + (1e543, "ctogintacentillion"), + (1e573, "nonagintacentillion"), + (1e603, "ducentillion"), + (1e903, "trecentillion"), + (1e1203, "quadringentillion"), + (1e1503, "quingentillion"), + (1e1803, "sescentillion"), + (1e2103, "septingentillion"), + (1e2403, "octingentillion"), + (1e2703, "nongentillion"), + (1e3003, "millinillion") +]) diff --git a/lingua_franca/lang/common_data_nl.py b/lingua_franca/lang/common_data_nl.py new file mode 100644 index 0000000..1bed1b5 --- /dev/null +++ b/lingua_franca/lang/common_data_nl.py @@ -0,0 +1,323 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2019 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from collections import OrderedDict +from .parse_common import invert_dict + +_ARTICLES_NL = {'de', 'het'} + +_NUM_STRING_NL = { + 0: 'nul', + 1: 'een', + 2: 'twee', + 3: 'drie', + 4: 'vier', + 5: 'vijf', + 6: 'zes', + 7: 'zeven', + 8: 'acht', + 9: 'negen', + 10: 'tien', + 11: 'elf', + 12: 'twaalf', + 13: 'dertien', + 14: 'veertien', + 15: 'vijftien', + 16: 'zestien', + 17: 'zeventien', + 18: 'achttien', + 19: 'negentien', + 20: 'twintig', + 30: 'dertig', + 40: 'veertig', + 50: 'vijftig', + 60: 'zestig', + 70: 'zeventig', + 80: 'tachtig', + 90: 'negentig' +} + +_FRACTION_STRING_NL = { + 2: 'half', + 3: 'derde', + 4: 'vierde', + 5: 'vijfde', + 6: 'zesde', + 7: 'zevende', + 8: 'achtste', + 9: 'negende', + 10: 'tiende', + 11: 'elfde', + 12: 'twaalfde', + 13: 'dertiende', + 14: 'veertiende', + 15: 'vijftiende', + 16: 'zestiende', + 17: 'zeventiende', + 18: 'achttiende', + 19: 'negentiende', + 20: 'twintigste' +} + +_LONG_SCALE_NL = OrderedDict([ + (100, 'honderd'), + (1000, 'duizend'), + (1000000, 'miljoen'), + (1e12, "biljoen"), + (1e18, 'triljoen'), + (1e24, "quadriljoen"), + (1e30, "quintillion"), + (1e36, "sextillion"), + (1e42, "septillion"), + (1e48, "octillion"), + (1e54, "nonillion"), + (1e60, "decillion"), + (1e66, "undecillion"), + (1e72, "duodecillion"), + (1e78, "tredecillion"), + (1e84, "quattuordecillion"), + (1e90, "quinquadecillion"), + (1e96, "sedecillion"), + (1e102, "septendecillion"), + (1e108, "octodecillion"), + (1e114, "novendecillion"), + (1e120, "vigintillion"), + (1e306, "unquinquagintillion"), + (1e312, "duoquinquagintillion"), + (1e336, "sesquinquagintillion"), + (1e366, "unsexagintillion") +]) + +_SHORT_SCALE_NL = OrderedDict([ + (100, 'honderd'), + (1000, 'duizend'), + (1000000, 'miljoen'), + (1e9, "miljard"), + (1e12, 'biljoen'), + (1e15, "quadrillion"), + (1e18, "quintiljoen"), + (1e21, "sextiljoen"), + (1e24, "septiljoen"), + (1e27, "octiljoen"), + (1e30, "noniljoen"), + (1e33, "deciljoen"), + (1e36, "undeciljoen"), + (1e39, 
"duodeciljoen"), + (1e42, "tredeciljoen"), + (1e45, "quattuordeciljoen"), + (1e48, "quinquadeciljoen"), + (1e51, "sedeciljoen"), + (1e54, "septendeciljoen"), + (1e57, "octodeciljoen"), + (1e60, "novendeciljoen"), + (1e63, "vigintiljoen"), + (1e66, "unvigintiljoen"), + (1e69, "uuovigintiljoen"), + (1e72, "tresvigintiljoen"), + (1e75, "quattuorvigintiljoen"), + (1e78, "quinquavigintiljoen"), + (1e81, "qesvigintiljoen"), + (1e84, "septemvigintiljoen"), + (1e87, "octovigintiljoen"), + (1e90, "novemvigintiljoen"), + (1e93, "trigintiljoen"), + (1e96, "untrigintiljoen"), + (1e99, "duotrigintiljoen"), + (1e102, "trestrigintiljoen"), + (1e105, "quattuortrigintiljoen"), + (1e108, "quinquatrigintiljoen"), + (1e111, "sestrigintiljoen"), + (1e114, "septentrigintiljoen"), + (1e117, "octotrigintiljoen"), + (1e120, "noventrigintiljoen"), + (1e123, "quadragintiljoen"), + (1e153, "quinquagintiljoen"), + (1e183, "sexagintiljoen"), + (1e213, "septuagintiljoen"), + (1e243, "octogintiljoen"), + (1e273, "nonagintiljoen"), + (1e303, "centiljoen"), + (1e306, "uncentiljoen"), + (1e309, "duocentiljoen"), + (1e312, "trescentiljoen"), + (1e333, "decicentiljoen"), + (1e336, "undecicentiljoen"), + (1e363, "viginticentiljoen"), + (1e366, "unviginticentiljoen"), + (1e393, "trigintacentiljoen"), + (1e423, "quadragintacentiljoen"), + (1e453, "quinquagintacentiljoen"), + (1e483, "sexagintacentiljoen"), + (1e513, "septuagintacentiljoen"), + (1e543, "ctogintacentiljoen"), + (1e573, "nonagintacentiljoen"), + (1e603, "ducentiljoen"), + (1e903, "trecentiljoen"), + (1e1203, "quadringentiljoen"), + (1e1503, "quingentiljoen"), + (1e1803, "sescentiljoen"), + (1e2103, "septingentiljoen"), + (1e2403, "octingentiljoen"), + (1e2703, "nongentiljoen"), + (1e3003, "milliniljoen") +]) + +_ORDINAL_STRING_BASE_NL = { + 1: 'eerste', + 2: 'tweede', + 3: 'derde', + 4: 'vierde', + 5: 'vijfde', + 6: 'zesde', + 7: 'zevende', + 8: 'achtste', + 9: 'negende', + 10: 'tiende', + 11: 'elfde', + 12: 'twaalfde', + 13: 'dertiende', + 
14: 'veertiende', + 15: 'vijftiende', + 16: 'zestiende', + 17: 'zeventiende', + 18: 'achttiende', + 19: 'negentiende', + 20: 'twintigste', + 30: 'dertigste', + 40: "veertigste", + 50: "vijftigste", + 60: "zestigste", + 70: "zeventigste", + 80: "tachtigste", + 90: "negentigste", + 1e2: "honderdste", + 1e3: "duizendste" +} + +_SHORT_ORDINAL_STRING_NL = { + 1e6: "miljoenste", + 1e9: "miljardste", + 1e12: "biljoenste", + 1e15: "biljardste", + 1e18: "triljoenste", + 1e21: "triljardste", + 1e24: "quadriljoenste", + 1e27: "quadriljardste", + 1e30: "quintiljoenste", + 1e33: "quintiljardste" + # TODO > 1e33 +} +_SHORT_ORDINAL_STRING_NL.update(_ORDINAL_STRING_BASE_NL) + +_LONG_ORDINAL_STRING_NL = { + 1e6: "miljoenste", + 1e9: "miljardste", + 1e12: "biljoenste", + 1e15: "biljardste", + 1e18: "triljoenste", + 1e21: "triljardste", + 1e24: "quadriljoenste", + 1e27: "quadriljardste", + 1e30: "quintiljoenste", + 1e33: "quintiljardste" + # TODO > 1e60 +} +_LONG_ORDINAL_STRING_NL.update(_ORDINAL_STRING_BASE_NL) + +# negate next number (-2 = 0 - 2) +_NEGATIVES_NL = {"min", "minus"} + +# sum the next number (twenty two = 20 + 2) +_SUMS_NL = {'twintig', '20', 'dertig', '30', 'veertig', '40', 'vijftig', '50', + 'zestig', '60', 'zeventig', '70', 'tachtig', '80', 'negentig', + '90'} + +_MULTIPLIES_LONG_SCALE_NL = set(_LONG_SCALE_NL.values()) + +_MULTIPLIES_SHORT_SCALE_NL = set(_SHORT_SCALE_NL.values()) + +# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) +_FRACTION_MARKER_NL = {"en"} + +# decimal marker ( 1 point 5 = 1 + 0.5) +_DECIMAL_MARKER_NL = {"komma", "punt"} + +_STRING_NUM_NL = invert_dict(_NUM_STRING_NL) +_STRING_NUM_NL.update({ + "half": 0.5, + "driekwart": 0.75, + "anderhalf": 1.5, + "paar": 2 +}) + +_STRING_SHORT_ORDINAL_NL = invert_dict(_SHORT_ORDINAL_STRING_NL) +_STRING_LONG_ORDINAL_NL = invert_dict(_LONG_ORDINAL_STRING_NL) + +_MONTHS_NL = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', + 'juli', 'augustus', 'september', 'oktober', 'november', + 
'december'] + +_NUM_STRING_NL = { + 0: 'nul', + 1: 'één', + 2: 'twee', + 3: 'drie', + 4: 'vier', + 5: 'vijf', + 6: 'zes', + 7: 'zeven', + 8: 'acht', + 9: 'negen', + 10: 'tien', + 11: 'elf', + 12: 'twaalf', + 13: 'dertien', + 14: 'veertien', + 15: 'vijftien', + 16: 'zestien', + 17: 'zeventien', + 18: 'achttien', + 19: 'negentien', + 20: 'twintig', + 30: 'dertig', + 40: 'veertig', + 50: 'vijftig', + 60: 'zestig', + 70: 'zeventig', + 80: 'tachtig', + 90: 'negentig', + 100: 'honderd' +} + +# Dutch uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales +# Currently, numbers are limited to 1000000000000000000000000, +# but _NUM_POWERS_OF_TEN can be extended to include additional number words + + +_NUM_POWERS_OF_TEN = [ + '', 'duizend', 'miljoen', 'miljard', 'biljoen', 'biljard', 'triljoen', + 'triljard' +] + +# Numbers below 1 million are written in one word in Dutch, yielding very +# long words +# In some circumstances it may be better to separate individual words +# Set _EXTRA_SPACE_NL=" " for separating numbers below 1 million ( +# orthographically incorrect) +# Set _EXTRA_SPACE_NL="" for correct spelling, this is standard + +# _EXTRA_SPACE_NL = " " +_EXTRA_SPACE_NL = "" diff --git a/lingua_franca/lang/common_data_pl.py b/lingua_franca/lang/common_data_pl.py new file mode 100644 index 0000000..77fbdf7 --- /dev/null +++ b/lingua_franca/lang/common_data_pl.py @@ -0,0 +1,497 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from collections import OrderedDict + + +_NUM_STRING_PL = { + 0: 'zero', + 1: 'jeden', + 2: 'dwa', + 3: 'trzy', + 4: 'cztery', + 5: 'pięć', + 6: 'sześć', + 7: 'siedem', + 8: 'osiem', + 9: 'dziewięć', + 10: 'dziesięć', + 11: 'jedenaście', + 12: 'dwanaście', + 13: 'trzynaście', + 14: 'czternaście', + 15: 'piętnaście', + 16: 'szesnaście', + 17: 'siedemnaście', + 18: 'osiemnaście', + 19: 'dziewiętnaście', + 20: 'dwadzieścia', + 30: 'trzydzieści', + 40: 'czterdzieści', + 50: 'pięćdziesiąt', + 60: 'sześćdziesiąt', + 70: 'siedemdziesiąt', + 80: 'osiemdziesiąt', + 90: 'dziewięćdziesiąt', + 100: 'sto', + 200: 'dwieście', + 300: 'trzysta', + 400: 'czterysta', + 500: 'pięćset', + 600: 'sześćset', + 700: 'siedemset', + 800: 'osiemset', + 900: 'dziewięćset', +} + + +_FRACTION_STRING_PL = { + 1: 'jedna', + 2: 'druga', + 3: 'trzecia', + 4: 'czwarta', + 5: 'piąta', + 6: 'szósta', + 7: 'siódma', + 8: 'ósma', + 9: 'dziewiąta', + 10: 'dziesiąta', + 11: 'jedenasta', + 12: 'dwunasta', + 13: 'trzynasta', + 14: 'czternasta', + 15: 'piętnasta', + 16: 'szesnasta', + 17: 'siedemnasta', + 18: 'osiemnasta', + 19: 'dziewiętnasta', + 20: 'dwudziesta', + 30: 'trzydziesta', + 40: 'czterdziesta', + 50: 'pięćdziesiąta', + 60: 'sześćdziesiąta', + 70: 'siedemdziesiąta', + 80: 'osiemdziesiąta', + 90: 'dziewięćdziesiąta', + 100: 'setna', + 200: 'dwusetna', + 300: 'trzysetna', + 400: 'czterysetna', + 500: 'pięćsetna', + 600: 'sześćsetna', + 700: 'siedemsetna', + 800: 'osiemsetna', + 900: 'dziewięćsetna', + 1000: 'tysięczna', +} + +_SHORT_SCALE_PL = OrderedDict([ + (100, 'sto'), + (200, 'dwieście'), + (300, 'trzysta'), + (400, 'czterysta'), + (500, 'pięćset'), + (600, 'sześćset'), + (700, 'siedemset'), + (800, 'osiemset'), + (900, 'dziewięćset'), + (1000, 'tysiąc'), + (1000000, 'milion'), + (1e9, "miliard"), + (1e12, 'bilion'), + (1e15, "biliard"), + (1e18, "trylion"), + (1e21, "sekstilion"), + (1e24, "kwadrylion"), + (1e27, "kwadryliard"), + (1e30, "kwintylion"), + (1e33, "kwintyliard"), + (1e36, 
"sekstylion"), + (1e39, "sekstyliard"), + (1e42, "septylion"), + (1e45, "septyliard"), + (1e48, "oktylion"), + (1e51, "oktyliard"), + (1e54, "nonilion"), + (1e57, "noniliard"), + (1e60, "decylion"), + (1e63, "decyliard"), + (1e66, "undecylion"), + (1e69, "undecyliard"), + (1e72, "duodecylion"), + (1e75, "duodecyliard"), + (1e78, "tredecylion"), + (1e81, "tredecyliard"), + (1e84, "kwartyduodecylion"), + (1e87, "kwartyduodecyliard"), + (1e90, "kwintyduodecylion"), + (1e93, "kwintyduodecyliard"), + (1e96, "seksdecylion"), + (1e99, "seksdecyliard"), + (1e102, "septydecylion"), + (1e105, "septydecyliard"), + (1e108, "oktodecylion"), + (1e111, "oktodecyliard"), + (1e114, "nondecylion"), + (1e117, "nondecyliard"), + (1e120, "wigintylion"), + (1e123, "wigintyliard"), + (1e153, "quinquagintylion"), + (1e183, "trycyliard"), + (1e213, "septuagintylion"), + (1e243, "kwadragiliard"), + (1e273, "nonagintylion"), + (1e303, "centezylion"), + (1e306, "uncentylion"), + (1e309, "duocentylion"), + (1e312, "trescentylion"), + (1e333, "decicentylion"), + (1e336, "undecicentylion"), + (1e363, "viginticentylion"), + (1e366, "unviginticentylion"), + (1e393, "trigintacentylion"), + (1e423, "quadragintacentylion"), + (1e453, "quinquagintacentylion"), + (1e483, "sexagintacentylion"), + (1e513, "septuagintacentylion"), + (1e543, "ctogintacentylion"), + (1e573, "nonagintacentylion"), + (1e603, "centyliard"), + (1e903, "trecentylion"), + (1e1203, "quadringentylion"), + (1e1503, "quingentylion"), + (1e1803, "sescentylion"), + (1e2103, "septingentylion"), + (1e2403, "octingentylion"), + (1e2703, "nongentylion"), + (1e3003, "milinylion") +]) + + +_ORDINAL_BASE_PL = { + 1: 'pierwszy', + 2: 'drugi', + 3: 'trzeci', + 4: 'czwarty', + 5: 'piąty', + 6: 'szósty', + 7: 'siódmy', + 8: 'ósmy', + 9: 'dziewiąty', + 10: 'dziesiąty', + 11: 'jedenasty', + 12: 'dwunasty', + 13: 'trzynasty', + 14: 'czternasty', + 15: 'piętnasty', + 16: 'szesnasty', + 17: 'siedemnasty', + 18: 'osiemnasty', + 19: 'dziewiętnasty', + 
20: 'dwudziesty', + 30: 'trzydziesty', + 40: "czterdziesty", + 50: "pięćdziesiąty", + 60: "sześćdziesiąty", + 70: "siedemdziesiąty", + 80: "osiemdziesiąty", + 90: "dziewięćdziesiąty", + 1e2: "setny", + 1e3: "tysięczny" +} + + +_SHORT_ORDINAL_PL = { + 1e6: "milionowy", + 1e9: "miliardowy", + 1e12: "bilionowy", + 1e15: "biliardowy", + 1e18: "trylionowy", + 1e21: "tryliardowy", + 1e24: "kwadrylionowy", + 1e27: "kwadryliardowy", + 1e30: "kwintylionowy", + 1e33: "kwintyliardowy", + 1e36: "sekstylionowy", + 1e42: "septylionowy", + 1e48: "oktylionowy", + 1e54: "nonylionowy", + 1e60: "decylionowy" + # TODO > 1e60 +} +_SHORT_ORDINAL_PL.update(_ORDINAL_BASE_PL) + +_ALT_ORDINALS_PL = { + 1: 'pierwszej', + 2: 'drugiej', + 3: 'trzeciej', + 4: 'czwartej', + 5: 'piątej', + 6: 'szóstej', + 7: 'siódmej', + 8: 'ósmej', + 9: 'dziewiątej', + 10: 'dziesięcio', + 11: 'jedenasto', + 12: 'dwunasto', + 13: 'trzynasto', + 14: 'czternasto', + 15: 'piętnasto', + 16: 'szesnasto', + 17: 'siedemnasto', + 18: 'osiemnasto', + 19: 'dziewiętnasto', + 20: 'dwudziesto', + 30: 'trzydziesto', + 40: 'czterdziesto', + 50: 'pięćdziesięcio', + 60: 'sześćdziesięcio', + 70: 'siedemdziesięcio', + 80: 'osiemdziesięcio', + 90: 'dziewięćdziesięcio', +} + +_TIME_UNITS_CONVERSION = { + 'mikrosekund': 'microseconds', + 'mikrosekundy': 'microseconds', + 'milisekund': 'milliseconds', + 'milisekundy': 'milliseconds', + 'sekunda': 'seconds', + 'sekundy': 'seconds', + 'sekund': 'seconds', + 'minuta': 'minutes', + 'minuty': 'minutes', + 'minut': 'minutes', + 'godzina': 'hours', + 'godziny': 'hours', + 'godzin': 'hours', + 'dzień': 'days', + 'dni': 'days', + 'tydzień': 'weeks', + 'tygodni': 'weeks', + 'tygodnie': 'weeks', + 'tygodniu': 'weeks', +} + +_TIME_UNITS_NORMALIZATION = { + 'mikrosekunda': 'mikrosekunda', + 'mikrosekundę': 'mikrosekunda', + 'mikrosekund': 'mikrosekunda', + 'mikrosekundy': 'mikrosekunda', + 'milisekunda': 'milisekunda', + 'milisekundę': 'milisekunda', + 'milisekund': 'milisekunda', + 'milisekundy': 
'milisekunda', + 'sekunda': 'sekunda', + 'sekundę': 'sekunda', + 'sekundy': 'sekunda', + 'sekund': 'sekunda', + 'minuta': 'minuta', + 'minutę': 'minuta', + 'minut': 'minuta', + 'minuty': 'minuta', + 'godzina': 'godzina', + 'godzinę': 'godzina', + 'godzin': 'godzina', + 'godziny': 'godzina', + 'dzień': 'dzień', + 'dni': 'dzień', + 'tydzień': 'tydzień', + 'tygodni': 'tydzień', + 'tygodnie': 'tydzień', + 'tygodniu': 'tydzień', + 'miesiąc': 'miesiąc', + 'miesiące': 'miesiąc', + 'miesięcy': 'miesiąc', + 'rok': 'rok', + 'lata': 'rok', + 'lat': 'rok', + 'dekada': 'dekada', + 'dekad': 'dekada', + 'dekady': 'dekada', + 'dekadę': 'dekada', + 'wiek': 'wiek', + 'wieki': 'wiek', + 'milenia': 'milenia', + 'milenium': 'milenia', +} + +_MONTHS_TO_EN = { + 'styczeń': 'January', + 'stycznia': 'January', + 'luty': 'February', + 'lutego': 'February', + 'marzec': 'March', + 'marca': 'March', + 'kwiecień': 'April', + 'kwietnia': 'April', + 'maj': 'May', + 'maja': 'May', + 'czerwiec': 'June', + 'czerwca': 'June', + 'lipiec': 'July', + 'lipca': 'July', + 'sierpień': 'August', + 'sierpnia': 'August', + 'wrzesień': 'September', + 'września': 'September', + 'październik': 'October', + 'października': 'October', + 'listopad': 'November', + 'listopada': 'November', + 'grudzień': 'December', + 'grudnia': 'December', +} + +_DAYS_TO_EN = { + 'poniedziałek': 0, + 'poniedziałkach': 0, + 'poniedziałkami': 0, + 'poniedziałki': 0, + 'poniedziałkiem': 0, + 'poniedziałkom': 0, + 'poniedziałkowa': 0, + 'poniedziałkową': 0, + 'poniedziałkowe': 0, + 'poniedziałkowego': 0, + 'poniedziałkowej': 0, + 'poniedziałkowemu': 0, + 'poniedziałkowi': 0, + 'poniedziałkowy': 0, + 'poniedziałkowych': 0, + 'poniedziałkowym': 0, + 'poniedziałkowymi': 0, + 'poniedziałków': 0, + 'poniedziałku': 0, + 'wtorek': 1, + 'wtorkach': 1, + 'wtorkami': 1, + 'wtorki': 1, + 'wtorkiem': 1, + 'wtorkom': 1, + 'wtorkowa': 1, + 'wtorkową': 1, + 'wtorkowe': 1, + 'wtorkowego': 1, + 'wtorkowej': 1, + 'wtorkowemu': 1, + 'wtorkowi': 1, + 
'wtorkowy': 1, + 'wtorkowych': 1, + 'wtorkowym': 1, + 'wtorkowymi': 1, + 'wtorków': 1, + 'wtorku': 1, + 'środa': 2, + 'środach': 2, + 'środami': 2, + 'środą': 2, + 'środę': 2, + 'środo': 2, + 'środom': 2, + 'środowa': 2, + 'środową': 2, + 'środowe': 2, + 'środowego': 2, + 'środowej': 2, + 'środowemu': 2, + 'środowi': 2, + 'środowy': 2, + 'środowych': 2, + 'środowym': 2, + 'środowymi': 2, + 'środy': 2, + 'środzie': 2, + 'śród': 2, + 'czwartek': 3, + 'czwartkach': 3, + 'czwartkami': 3, + 'czwartki': 3, + 'czwartkiem': 3, + 'czwartkom': 3, + 'czwartkowa': 3, + 'czwartkową': 3, + 'czwartkowe': 3, + 'czwartkowego': 3, + 'czwartkowej': 3, + 'czwartkowemu': 3, + 'czwartkowi': 3, + 'czwartkowy': 3, + 'czwartkowych': 3, + 'czwartkowym': 3, + 'czwartkowymi': 3, + 'czwartków': 3, + 'czwartku': 3, + 'piątek': 4, + 'piątkach': 4, + 'piątkami': 4, + 'piątki': 4, + 'piątkiem': 4, + 'piątkom': 4, + 'piątkowa': 4, + 'piątkową': 4, + 'piątkowe': 4, + 'piątkowego': 4, + 'piątkowej': 4, + 'piątkowemu': 4, + 'piątkowi': 4, + 'piątkowy': 4, + 'piątkowych': 4, + 'piątkowym': 4, + 'piątkowymi': 4, + 'piątków': 4, + 'piątku': 4, + 'sobocie': 5, + 'sobota': 5, + 'sobotach': 5, + 'sobotami': 5, + 'sobotą': 5, + 'sobotę': 5, + 'sobotni': 5, + 'sobotnia': 5, + 'sobotnią': 5, + 'sobotnich': 5, + 'sobotnie': 5, + 'sobotniego': 5, + 'sobotniej': 5, + 'sobotniemu': 5, + 'sobotnim': 5, + 'sobotnimi': 5, + 'soboto': 5, + 'sobotom': 5, + 'soboty': 5, + 'sobót': 5, + 'niedziel': 6, + 'niedziela': 6, + 'niedzielach': 6, + 'niedzielami': 6, + 'niedzielą': 6, + 'niedziele': 6, + 'niedzielę': 6, + 'niedzieli': 6, + 'niedzielna': 6, + 'niedzielną': 6, + 'niedzielne': 6, + 'niedzielnego': 6, + 'niedzielnej': 6, + 'niedzielnemu': 6, + 'niedzielni': 6, + 'niedzielny': 6, + 'niedzielnych': 6, + 'niedzielnym': 6, + 'niedzielnymi': 6, + 'niedzielo': 6, + 'niedzielom': 6 +} diff --git a/lingua_franca/lang/common_data_pt.py b/lingua_franca/lang/common_data_pt.py new file mode 100644 index 0000000..ea86d94 --- 
/dev/null +++ b/lingua_franca/lang/common_data_pt.py @@ -0,0 +1,135 @@ +_FUNCTION_NOT_IMPLEMENTED_WARNING = "esta função não foi implementada em 'pt'" + +# Undefined articles ["um", "uma", "uns", "umas"] can not be suppressed, +# in PT, "um cavalo" means "a horse" or "one horse". + +_ARTICLES_PT = ["o", "a", "os", "as"] + +# word rules for gender +_FEMALE_ENDINGS_PT = ["a", "as"] +_MALE_ENDINGS_PT = ["o", "os"] + +# special cases, word lookup for words not covered by above rule +_GENDERS_PT = { + "mulher": "f", + "mulheres": "f", + "homem": "m" +} + +# context rules for gender +_MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"] +_FEMALE_DETERMINANTS_PT = ["a", "as", "esta", "estas", "essa", "essas"] + +_NUMBERS_PT = { + "zero": 0, + "um": 1, + "uma": 1, + "uns": 1, + "umas": 1, + "primeiro": 1, + "segundo": 2, + "terceiro": 3, + "dois": 2, + "duas": 2, + "tres": 3, + "três": 3, + "quatro": 4, + "cinco": 5, + "seis": 6, + "sete": 7, + "oito": 8, + "nove": 9, + "dez": 10, + "onze": 11, + "doze": 12, + "treze": 13, + "catorze": 14, + "quinze": 15, + "dezasseis": 16, + "dezassete": 17, + "dezoito": 18, + "dezanove": 19, + "vinte": 20, + "trinta": 30, + "quarenta": 40, + "cinquenta": 50, + "sessenta": 60, + "setenta": 70, + "oitenta": 80, + "noventa": 90, + "cem": 100, + "cento": 100, + "duzentos": 200, + "duzentas": 200, + "trezentos": 300, + "trezentas": 300, + "quatrocentos": 400, + "quatrocentas": 400, + "quinhentos": 500, + "quinhentas": 500, + "seiscentos": 600, + "seiscentas": 600, + "setecentos": 700, + "setecentas": 700, + "oitocentos": 800, + "oitocentas": 800, + "novecentos": 900, + "novecentas": 900, + "mil": 1000, + "milhão": 1000000} + +_FRACTION_STRING_PT = { + 2: 'meio', + 3: 'terço', + 4: 'quarto', + 5: 'quinto', + 6: 'sexto', + 7: 'sétimo', + 8: 'oitavo', + 9: 'nono', + 10: 'décimo', + 11: 'onze avos', + 12: 'doze avos', + 13: 'treze avos', + 14: 'catorze avos', + 15: 'quinze avos', + 16: 'dezasseis avos', + 17: 'dezassete avos', + 
18: 'dezoito avos', + 19: 'dezanove avos', + 20: 'vigésimo', + 30: 'trigésimo', + 100: 'centésimo', + 1000: 'milésimo' +} + +_NUM_STRING_PT = { + 0: 'zero', + 1: 'um', + 2: 'dois', + 3: 'três', + 4: 'quatro', + 5: 'cinco', + 6: 'seis', + 7: 'sete', + 8: 'oito', + 9: 'nove', + 10: 'dez', + 11: 'onze', + 12: 'doze', + 13: 'treze', + 14: 'catorze', + 15: 'quinze', + 16: 'dezasseis', + 17: 'dezassete', + 18: 'dezoito', + 19: 'dezanove', + 20: 'vinte', + 30: 'trinta', + 40: 'quarenta', + 50: 'cinquenta', + 60: 'sessenta', + 70: 'setenta', + 80: 'oitenta', + 90: 'noventa' +} diff --git a/lingua_franca/lang/common_data_ru.py b/lingua_franca/lang/common_data_ru.py new file mode 100644 index 0000000..dfa795f --- /dev/null +++ b/lingua_franca/lang/common_data_ru.py @@ -0,0 +1,304 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from collections import OrderedDict + + +_NUM_STRING_RU = { + 0: 'ноль', + 1: 'один', + 2: 'два', + 3: 'три', + 4: 'четыре', + 5: 'пять', + 6: 'шесть', + 7: 'семь', + 8: 'восемь', + 9: 'девять', + 10: 'десять', + 11: 'одиннадцать', + 12: 'двенадцать', + 13: 'тринадцать', + 14: 'четырнадцать', + 15: 'пятнадцать', + 16: 'шестнадцать', + 17: 'семнадцать', + 18: 'восемнадцать', + 19: 'девятнадцать', + 20: 'двадцать', + 30: 'тридцать', + 40: 'сорок', + 50: 'пятьдесят', + 60: 'шестьдесят', + 70: 'семьдесят', + 80: 'восемьдесят', + 90: 'девяносто', + 100: 'сто', + 200: 'двести', + 300: 'триста', + 400: 'четыреста', + 500: 'пятьсот', + 600: 'шестьсот', + 700: 'семьсот', + 800: 'восемьсот', + 900: 'девятьсот' +} + + +_FRACTION_STRING_RU = { + 2: 'половина', + 3: 'треть', + 4: 'четверть', + 5: 'пятая', + 6: 'шестая', + 7: 'седьмая', + 8: 'восьмая', + 9: 'девятая', + 10: 'десятая', + 11: 'одиннадцатая', + 12: 'двенадцатая', + 13: 'тринадцатая', + 14: 'четырнадцатая', + 15: 'пятнадцатая', + 16: 'шестнадцатая', + 17: 'семнадцатая', + 18: 'восемнадцатая', + 19: 'девятнадцатая', + 20: 'двадцатая', + 30: 'тридцатая', + 40: 'сороковая', + 50: 'пятидесятая', + 60: 'шестидесятая', + 70: 'семидесятая', + 80: 'восьмидесятая', + 90: 'девяностая', + 1e2: 'сотая', + 1e3: 'тысячная', + 1e6: 'миллионная', + 1e9: 'миллиардная' +} + + +_SHORT_SCALE_RU = OrderedDict([ + (1e3, 'тысяча'), + (1e6, "миллион"), + (1e9, "миллиард"), + (1e12, "триллион"), + (1e15, "квадриллион"), + (1e18, "квинтиллион"), + (1e21, "секстиллион"), + (1e24, "септиллион"), + (1e27, "октиллион"), + (1e30, "нониллион"), + (1e33, "дециллион"), + (1e36, "ундециллион"), + (1e39, "дуодециллион"), + (1e42, "тредециллион"), + (1e45, "кваттордециллион"), + (1e48, "квиндециллион"), + (1e51, "сексдециллион"), + (1e54, "септендециллион"), + (1e57, "октодециллион"), + (1e60, "новемдециллион"), + (1e63, "вигинтиллион"), + (1e66, "унвигинтиллион"), + (1e69, "дуовигинтиллион"), + (1e72, "тревигинтиллион"), + (1e75, 
"кватторвигинтиллион"), + (1e78, "квинвигинтиллион"), + (1e81, "секснвигинтиллион"), + (1e84, "септенвигинтиллион"), + (1e87, "октовигинтиллион"), + (1e90, "новемвигинтиллион"), + (1e93, "тригинтиллион"), +]) + + +_LONG_SCALE_RU = OrderedDict([ + (1e3, 'тысяча'), + (1e6, "миллион"), + (1e9, "миллиард"), + (1e12, "биллион"), + (1e15, "биллиард"), + (1e18, "триллион"), + (1e21, "триллиард"), + (1e24, "квадриллион"), + (1e27, "квадриллиард"), + (1e30, "квинтиллион"), + (1e33, "квинтиллиард"), + (1e36, "секстиллион"), + (1e39, "секстиллиард"), + (1e42, "септиллион"), + (1e45, "септиллиард"), + (1e48, "октиллион"), + (1e51, "октиллиард"), + (1e54, "нониллион"), + (1e57, "нониллиард"), + (1e60, "дециллион"), + (1e63, "дециллиард"), + (1e66, "ундециллион"), + (1e72, "дуодециллион"), + (1e78, "тредециллион"), + (1e84, "кваттордециллион"), + (1e90, "квиндециллион"), + (1e96, "сексдециллион"), + (1e102, "септендециллион"), + (1e108, "октодециллион"), + (1e114, "новемдециллион"), + (1e120, "вигинтиллион"), +]) + + +_ORDINAL_BASE_RU = { + 1: 'первый', + 2: 'второй', + 3: 'третий', + 4: 'четвёртый', + 5: 'пятый', + 6: 'шестой', + 7: 'седьмой', + 8: 'восьмой', + 9: 'девятый', + 10: 'десятый', + 11: 'одиннадцатый', + 12: 'двенадцатый', + 13: 'тринадцатый', + 14: 'четырнадцатый', + 15: 'пятнадцатый', + 16: 'шестнадцатый', + 17: 'семнадцатый', + 18: 'восемнадцатый', + 19: 'девятнадцатый', + 20: 'двадцатый', + 30: 'тридцатый', + 40: "сороковой", + 50: "пятидесятый", + 60: "шестидесятый", + 70: "семидесятый", + 80: "восьмидесятый", + 90: "девяностый", + 1e2: "сотый", + 2e2: "двухсотый", + 3e2: "трёхсотый", + 4e2: "четырёхсотый", + 5e2: "пятисотый", + 6e2: "шестисотый", + 7e2: "семисотый", + 8e2: "восьмисотый", + 9e2: "девятисотый", + 1e3: "тысячный" +} + + +_SHORT_ORDINAL_RU = { + 1e6: "миллион", + 1e9: "миллиард", + 1e12: "триллион", + 1e15: "квадриллион", + 1e18: "квинтиллион", + 1e21: "секстиллион", + 1e24: "септиллион", + 1e27: "октиллион", + 1e30: "нониллион", + 1e33: 
"дециллион", + 1e36: "ундециллион", + 1e39: "дуодециллион", + 1e42: "тредециллион", + 1e45: "кваттордециллион", + 1e48: "квиндециллион", + 1e51: "сексдециллион", + 1e54: "септендециллион", + 1e57: "октодециллион", + 1e60: "новемдециллион", + 1e63: "вигинтиллион" +} +_SHORT_ORDINAL_RU.update(_ORDINAL_BASE_RU) + + +_LONG_ORDINAL_RU = { + 1e6: "миллион", + 1e9: "миллиард", + 1e12: "биллион", + 1e15: "биллиард", + 1e18: "триллион", + 1e21: "триллиард", + 1e24: "квадриллион", + 1e27: "квадриллиард", + 1e30: "квинтиллион", + 1e33: "квинтиллиард", + 1e36: "секстиллион", + 1e39: "секстиллиард", + 1e42: "септиллион", + 1e45: "септиллиард", + 1e48: "октиллион", + 1e51: "октиллиард", + 1e54: "нониллион", + 1e57: "нониллиард", + 1e60: "дециллион", + 1e63: "дециллиард", + 1e66: "ундециллион", + 1e72: "дуодециллион", + 1e78: "тредециллион", + 1e84: "кваттордециллион", + 1e90: "квиндециллион", + 1e96: "сексдециллион", + 1e102: "септендециллион", + 1e108: "октодециллион", + 1e114: "новемдециллион", + 1e120: "вигинтиллион" +} +_LONG_ORDINAL_RU.update(_ORDINAL_BASE_RU) + +# Months + +_MONTHS_CONVERSION = { + 0: "january", + 1: "february", + 2: "march", + 3: "april", + 4: "may", + 5: "june", + 6: "july", + 7: "august", + 8: "september", + 9: "october", + 10: "november", + 11: "december" +} + +_MONTHS_RU = ['январь', 'февраль', 'март', 'апрель', 'май', 'июнь', + 'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', + 'декабрь'] + +# Time +_TIME_UNITS_CONVERSION = { + 'микросекунд': 'microseconds', + 'милисекунд': 'milliseconds', + 'секунда': 'seconds', + 'секунды': 'seconds', + 'секунд': 'seconds', + 'минута': 'minutes', + 'минуты': 'minutes', + 'минут': 'minutes', + 'час': 'hours', + 'часа': 'hours', + 'часов': 'hours', + 'день': 'days', + 'дня': 'days', + 'дней': 'days', + 'неделя': 'weeks', + 'недели': 'weeks', + 'недель': 'weeks' +} diff --git a/lingua_franca/lang/common_data_sl.py b/lingua_franca/lang/common_data_sl.py new file mode 100644 index 0000000..171aa9d --- /dev/null +++ 
b/lingua_franca/lang/common_data_sl.py @@ -0,0 +1,173 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from collections import OrderedDict + + +_ARTICLES_SL = {} + + +_NUM_STRING_SL = { + 0: 'nič', + 1: 'ena', + 2: 'dve', + 3: 'tri', + 4: 'štiri', + 5: 'pet', + 6: 'šest', + 7: 'sedem', + 8: 'osem', + 9: 'devet', + 10: 'deset', + 11: 'enajst', + 12: 'dvanajst', + 13: 'trinajst', + 14: 'štirinajst', + 15: 'petnajst', + 16: 'šestnajst', + 17: 'sedemnajst', + 18: 'osemnajst', + 19: 'devetnajst', + 20: 'dvajset', + 30: 'trideset', + 40: 'štirideset', + 50: 'petdeset', + 60: 'šestdeset', + 70: 'sedemdeset', + 80: 'osemdeset', + 90: 'devetdeset' +} + + +_FRACTION_STRING_SL = { + 2: 'polovica', + 3: 'tretjina', + 4: 'četrtina', + 5: 'petina', + 6: 'šestina', + 7: 'sedmina', + 8: 'osmina', + 9: 'devetina', + 10: 'desetina', + 11: 'enajstina', + 12: 'dvanajstina', + 13: 'trinajstina', + 14: 'štirinajstina', + 15: 'petnajstina', + 16: 'šestnajstina', + 17: 'sedemnajstina', + 18: 'osemnajstina', + 19: 'devetnajstina', + 20: 'dvajsetina' +} + + +_LONG_SCALE_SL = OrderedDict([ + (100, 'sto'), + (1000, 'tisoč'), + (1000000, 'milijon'), + (1e12, 'bilijon'), + (1e18, 'trilijon'), + (1e24, 'kvadrilijon'), + (1e30, 'kvintilijon'), + (1e36, 'sekstilijon'), + (1e42, 'septilijon'), + (1e48, 'oktilijon'), + (1e54, 'nonilijon'), + (1e60, 'decilijon') + # TODO > 1e63 +]) + + +_SHORT_SCALE_SL = OrderedDict([ + (100, 'sto'), + (1000, 'tisoč'), + 
(1000000, 'milijon'), + (1e9, 'bilijon'), + (1e12, 'trilijon'), + (1e15, 'kvadrilijon'), + (1e18, 'kvintilijon'), + (1e21, 'sekstilijon'), + (1e24, 'septilijon'), + (1e27, 'oktilijon'), + (1e30, 'nonilijon'), + (1e33, 'decilijon') + # TODO > 1e33 +]) + + +_ORDINAL_BASE_SL = { + 1: 'prvi', + 2: 'drugi', + 3: 'tretji', + 4: 'četrti', + 5: 'peti', + 6: 'šesti', + 7: 'sedmi', + 8: 'osmi', + 9: 'deveti', + 10: 'deseti', + 11: 'enajsti', + 12: 'dvanajsti', + 13: 'trinajsti', + 14: 'štirinajsti', + 15: 'petnajsti', + 16: 'šestnajsti', + 17: 'sedemnajsti', + 18: 'osemnajsti', + 19: 'devetnajsti', + 20: 'dvajseti', + 30: 'trideseti', + 40: 'štirideseti', + 50: 'petdeseti', + 60: 'šestdeseti', + 70: 'sedemdeseti', + 80: 'osemdeseti', + 90: 'devetdeseti', + 1e2: 'stoti', + 1e3: 'tisoči' +} + + +_LONG_ORDINAL_SL = { + 1e6: 'milijonti', + 1e12: 'bilijonti', + 1e18: 'trilijonti', + 1e24: 'kvadrilijonti', + 1e30: 'kvintilijonti', + 1e36: 'sekstilijonti', + 1e42: 'septilijonti', + 1e48: 'oktilijonti', + 1e54: 'nonilijonti', + 1e60: 'decilijonti' + # TODO > 1e60 +} +_LONG_ORDINAL_SL.update(_ORDINAL_BASE_SL) + + +_SHORT_ORDINAL_SL = { + 1e6: 'milijonti', + 1e9: 'bilijonti', + 1e12: 'trilijonti', + 1e15: 'kvadrilijonti', + 1e18: 'kvintilijonti', + 1e21: 'sekstilijonti', + 1e24: 'septilijonti', + 1e27: 'oktilijonti', + 1e30: 'nonilijonti', + 1e33: 'decilijonti' + # TODO > 1e33 +} +_SHORT_ORDINAL_SL.update(_ORDINAL_BASE_SL) diff --git a/lingua_franca/lang/common_data_sv.py b/lingua_franca/lang/common_data_sv.py new file mode 100644 index 0000000..81139bf --- /dev/null +++ b/lingua_franca/lang/common_data_sv.py @@ -0,0 +1,72 @@ +_FUNCTION_NOT_IMPLEMENTED_WARNING = "Denna funktion har inte implementerats i 'sv'" + +_MONTHS_SV = ['januari', 'februari', 'mars', 'april', 'maj', 'juni', + 'juli', 'augusti', 'september', 'oktober', 'november', + 'december'] + +_NUM_STRING_SV = { + 0: 'noll', + 1: 'en', + 2: 'två', + 3: 'tre', + 4: 'fyra', + 5: 'fem', + 6: 'sex', + 7: 'sju', + 8: 'åtta', + 9: 
'nio', + 10: 'tio', + 11: 'elva', + 12: 'tolv', + 13: 'tretton', + 14: 'fjorton', + 15: 'femton', + 16: 'sexton', + 17: 'sjutton', + 18: 'arton', + 19: 'nitton', + 20: 'tjugo', + 30: 'trettio', + 40: 'fyrtio', + 50: 'femtio', + 60: 'sextio', + 70: 'sjuttio', + 80: 'åttio', + 90: 'nittio', + 100: 'hundra' +} + +_NUM_POWERS_OF_TEN_SV = [ + 'hundra', + 'tusen', + 'miljon', + 'miljard', + 'biljon', + 'biljard', + 'triljon', + 'triljard' +] + +_FRACTION_STRING_SV = { + 2: 'halv', + 3: 'tredjedel', + 4: 'fjärdedel', + 5: 'femtedel', + 6: 'sjättedel', + 7: 'sjundedel', + 8: 'åttondel', + 9: 'niondel', + 10: 'tiondel', + 11: 'elftedel', + 12: 'tolftedel', + 13: 'trettondel', + 14: 'fjortondel', + 15: 'femtondel', + 16: 'sextondel', + 17: 'sjuttondel', + 18: 'artondel', + 19: 'nittondel', + 20: 'tjugondel' +} + +_EXTRA_SPACE_SV = " " diff --git a/lingua_franca/lang/format_ca.py b/lingua_franca/lang/format_ca.py new file mode 100644 index 0000000..b3dc265 --- /dev/null +++ b/lingua_franca/lang/format_ca.py @@ -0,0 +1,596 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_ca import _FRACTION_STRING_CA, \ + _NUM_STRING_CA +from lingua_franca.internal import lookup_variant +from enum import IntEnum + + +class TimeVariantCA(IntEnum): + DEFAULT = 0 + BELL = 1 + FULL_BELL = 2 + SPANISH_LIKE = 3 + + +def nice_number_ca(number, speech, denominators=range(1, 21)): + """ Catalan helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 i mig" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. 
"1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + # denominador + den_str = _FRACTION_STRING_CA[den] + # fraccions + if whole == 0: + if num == 1: + # un desè + return_string = 'un {}'.format(den_str) + else: + # tres mig + return_string = '{} {}'.format(num, den_str) + # inteiros >10 + elif num == 1: + # trenta-un + return_string = '{}-{}'.format(whole, den_str) + # inteiros >10 com fracções + else: + # vint i 3 desens + return_string = '{} i {} {}'.format(whole, num, den_str) + # plural + if num > 1: + return_string += 's' + return return_string + + +def pronounce_number_ca(number, places=2): + """ + Convert a number to it's spoken equivalent + For example, '5.2' would return 'cinc coma dos' + Args: + number(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + Returns: + (str): The pronounced number + """ + if abs(number) >= 100: + # TODO: Support n > 100 + return str(number) + + result = "" + if number < 0: + result = "menys " + number = abs(number) + + if number >= 20: + tens = int(number - int(number) % 10) + ones = int(number - tens) + result += _NUM_STRING_CA[tens] + if ones > 0: + if tens == 20: + result += "-i-" + _NUM_STRING_CA[ones] + else: + result += "-" + _NUM_STRING_CA[ones] + else: + result += _NUM_STRING_CA[int(number)] + + # Deal with decimal part, in Catalan is commonly used the comma + # instead the dot. 
Decimal part can be written both with comma + # and dot, but when pronounced, its pronounced "coma" + if not number == int(number) and places > 0: + if abs(number) < 1.0 and (result == "menys " or not result): + result += "zero" + result += " coma" + _num_str = str(number) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + _NUM_STRING_CA[int(char)] + return result + + +@lookup_variant({ + "default": TimeVariantCA.DEFAULT, + "traditional": TimeVariantCA.FULL_BELL, + "bell": TimeVariantCA.BELL, + "full_bell": TimeVariantCA.FULL_BELL, + "spanish": TimeVariantCA.SPANISH_LIKE +}) +def nice_time_ca(dt, speech=True, use_24hour=False, use_ampm=False, + variant=None): + """ + Format a time to a comfortable human format + For example, generate 'cinc trenta' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + variant = variant or TimeVariantCA.DEFAULT + + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if variant == TimeVariantCA.BELL: + # Bell Catalan Time System + # https://en.wikipedia.org/wiki/Catalan_time_system + + if dt.minute < 7: + next_hour = False + elif dt.minute == 7 or dt.minute == 8: + speak += "mig quart" + next_hour = True + elif dt.minute < 15: + next_hour = False + elif dt.minute == 15: + speak += "un quart" + next_hour = True + elif dt.minute == 16: + speak += "un quart i un minut" + next_hour = True + elif dt.minute < 21: + speak += "un quart i " + pronounce_number_ca( + dt.minute - 15) + " minuts" + next_hour = True + elif dt.minute == 22 or dt.minute == 23: + speak += "un quart i mig" + next_hour = True + elif dt.minute < 30: + speak += "un quart i " + pronounce_number_ca( + dt.minute - 15) + " minuts" + next_hour = True + elif dt.minute == 30: + speak += "dos quarts" + next_hour = True + elif dt.minute == 31: + speak += "dos quarts i un minut" + next_hour = True + elif dt.minute < 37: + speak += "dos quarts i " + pronounce_number_ca( + dt.minute - 30) + " minuts" + next_hour = True + elif dt.minute == 37 or dt.minute == 38: + speak += "dos quarts i mig" + next_hour = True + elif dt.minute < 45: + speak += "dos quarts i " + pronounce_number_ca( + dt.minute - 30) + " minuts" + next_hour = True + elif dt.minute == 45: + speak += "tres quarts" + next_hour = True + elif dt.minute == 46: + speak += "tres quarts i un minut" + next_hour = True + elif dt.minute < 52: + speak += "tres quarts i " + pronounce_number_ca( + dt.minute - 45) + " minuts" + next_hour = True + elif dt.minute == 52 or dt.minute == 53: + speak += "tres quarts i mig" + next_hour = True + elif dt.minute > 53: + speak += "tres quarts i " + pronounce_number_ca( + dt.minute - 45) + " minuts" + next_hour = True + + if next_hour == True: + next_hour = (dt.hour + 1) % 12 + if 
next_hour == 0: + speak += " de dotze" + if dt.hour == 11: + speak += " del migdia" + else: + speak += " de la nit" + + elif next_hour == 1: + speak += " d'una" + if dt.hour == 12: + speak += " de la tarda" + else: + speak += " de la matinada" + elif next_hour == 2: + speak += "de dues" + if dt.hour == 13: + speak += " de la tarda" + else: + speak += " de la nit" + + elif next_hour == 11: + speak += "d'onze" + if dt.hour == 22: + speak += " de la nit" + else: + speak += " del matí" + else: + speak += "de " + pronounce_number_ca(next_hour) + if dt.hour == 0 and dt.hour < 5: + speak += " de la matinada" + elif dt.hour >= 5 and dt.hour < 11: + speak += " del matí" + elif dt.hour == 11: + speak += " del migdia" + elif dt.hour >= 12 and dt.hour <= 17: + speak += " de la tarda" + elif dt.hour >= 18 and dt.hour < 20: + speak += " del vespre" + elif dt.hour >= 21 and dt.hour <= 23: + speak += " de la nit" + + + else: + hour = dt.hour % 12 + if hour == 0: + speak += "les dotze" + elif hour == 1: + speak += "la una" + elif hour == 2: + speak += "les dues" + else: + speak += "les " + pronounce_number_ca(hour) + + if dt.minute == 0: + speak += " en punt" + elif dt.minute == 1: + speak += " i un minut" + else: + speak += " i " + pronounce_number_ca(dt.minute) + " minuts" + + if dt.hour == 0: + speak += " de la nit" + elif dt.hour >= 1 and dt.hour < 6: + speak += " de la matinada" + elif dt.hour >= 6 and dt.hour < 11: + speak += " del matí" + elif dt.hour == 12: + speak += " del migdia" + elif dt.hour >= 13 and dt.hour < 19: + speak += " de la tarda" + elif dt.hour >= 19 and dt.hour < 21: + speak += " del vespre" + elif dt.hour >= 21 and dt.hour <= 23: + speak += " de la nit" + + elif variant == TimeVariantCA.FULL_BELL: + # Full Bell Catalan Time System + # https://en.wikipedia.org/wiki/Catalan_time_system + + if dt.minute < 2: + # en punt + next_hour = False + if dt.minute < 5: + # tocades + next_hour = False + elif dt.minute < 7: + # ben tocades + next_hour = False + elif 
dt.minute < 9: + # mig quart + speak += "mig quart" + next_hour = True + elif dt.minute < 12: + # mig quart passat + speak += "mig quart passat" + next_hour = True + elif dt.minute < 14: + # mig quart passat + speak += "mig quart ben passat" + next_hour = True + elif dt.minute < 17: + speak += "un quart" + next_hour = True + elif dt.minute < 20: + speak += "un quart tocat" + next_hour = True + elif dt.minute < 22: + speak += "un quart ben tocat" + next_hour = True + elif dt.minute < 24: + speak += "un quart i mig" + next_hour = True + elif dt.minute < 27: + speak += "un quart i mig passat" + next_hour = True + elif dt.minute < 29: + speak += "un quart i mig ben passat" + next_hour = True + elif dt.minute < 32: + speak += "dos quarts" + next_hour = True + elif dt.minute < 35: + speak += "dos quarts tocats" + next_hour = True + elif dt.minute < 37: + speak += "dos quarts ben tocats" + next_hour = True + elif dt.minute < 39: + speak += "dos quarts i mig" + next_hour = True + elif dt.minute < 42: + speak += "dos quarts i mig passats" + next_hour = True + elif dt.minute < 44: + speak += "dos quarts i mig ben passats" + next_hour = True + elif dt.minute < 47: + speak += "tres quarts" + next_hour = True + elif dt.minute < 50: + speak += "tres quarts tocats" + next_hour = True + elif dt.minute < 52: + speak += "tres quarts ben tocats" + next_hour = True + elif dt.minute < 54: + speak += "tres quarts i mig" + next_hour = True + elif dt.minute < 57: + speak += "tres quarts i mig passats" + next_hour = True + elif dt.minute < 59: + speak += "tres quarts i mig ben passats" + next_hour = True + elif dt.minute == 59: + next_hour = False + + if next_hour == True: + next_hour = (dt.hour + 1) % 12 + if next_hour == 0: + speak += " de dotze" + if dt.hour == 11: + speak += " del migdia" + else: + speak += " de la nit" + + elif next_hour == 1: + speak += " d'una" + if dt.hour == 12: + speak += " de la tarda" + else: + speak += " de la matinada" + elif next_hour == 2: + speak += "de 
dues" + if dt.hour == 13: + speak += " de la tarda" + else: + speak += " de la nit" + + elif next_hour == 11: + speak += "d'onze" + if dt.hour == 22: + speak += " de la nit" + else: + speak += " del matí" + else: + speak += "de " + pronounce_number_ca(next_hour) + if dt.hour == 0 and dt.hour < 5: + speak += " de la matinada" + elif dt.hour >= 5 and dt.hour < 11: + speak += " del matí" + elif dt.hour == 11: + speak += " del migdia" + elif dt.hour >= 12 and dt.hour <= 17: + speak += " de la tarda" + elif dt.hour >= 18 and dt.hour < 20: + speak += " del vespre" + elif dt.hour >= 21 and dt.hour <= 23: + speak += " de la nit" + + else: + hour = dt.hour % 12 + if dt.minute == 59: + hour = (hour + 1) % 12 + if hour == 0: + speak += "les dotze" + elif hour == 1: + speak += "la una" + elif hour == 2: + speak += "les dues" + else: + speak += "les " + pronounce_number_ca(hour) + + if dt.minute == 0: + speak += " en punt" + elif dt.minute > 1 and dt.minute < 5: + if hour == 1: + speak += " tocada" + else: + speak += " tocades" + elif dt.minute < 7: + if hour == 1: + speak += " ben tocada" + else: + speak += " ben tocades" + + if dt.hour == 0: + if hour == 1: + speak += " de la matinada" + else: + speak += " de la nit" + elif dt.hour < 6: + if hour == 6: + speak += " del matí" + else: + speak += " de la matinada" + elif dt.hour < 12: + if hour == 12: + speak += " del migdia" + else: + speak += " del matí" + elif dt.hour == 12: + if hour == 1: + speak += " de la tarda" + else: + speak += " del migdia" + elif dt.hour < 19: + if hour == 7: + speak += " del vespre" + else: + speak += " de la tarda" + elif dt.hour < 21: + if hour == 9: + speak += " de la nit" + else: + speak += " del vespre" + elif dt.hour <= 23: + speak += " de la nit" + + elif variant == TimeVariantCA.SPANISH_LIKE: + # Prepare for "tres menys quart" ?? 
+ if dt.minute == 35: + minute = -25 + hour = dt.hour + 1 + elif dt.minute == 40: + minute = -20 + hour = dt.hour + 1 + elif dt.minute == 45: + minute = -15 + hour = dt.hour + 1 + elif dt.minute == 50: + minute = -10 + hour = dt.hour + 1 + elif dt.minute == 55: + minute = -5 + hour = dt.hour + 1 + else: + minute = dt.minute + hour = dt.hour + + if hour == 0 or hour == 12: + speak += "les dotze" + elif hour == 1 or hour == 13: + speak += "la una" + elif hour < 13: + speak = "les " + pronounce_number_ca(hour) + else: + speak = "les " + pronounce_number_ca(hour - 12) + + if minute != 0: + # les hores especials + if minute == 15: + speak += " i quart" + elif minute == 30: + speak += " i mitja" + elif minute == -15: + speak += " menys quart" + else: # sis i nou. set i veint-i-cinc + if minute > 0: + speak += " i " + pronounce_number_ca(minute) + else: # si son las set menys vint, no posem la "i" + speak += " " + pronounce_number_ca(minute) + + # Default Watch Time Sytem + else: + if use_24hour: + # simply speak the number + if dt.hour == 1: + speak += "la una" + elif dt.hour == 2: + speak += "les dues" + elif dt.hour == 21: + speak += "les vint-i-una" + elif dt.hour == 22: + speak += "les vint-i-dues" + else: + speak += "les " + pronounce_number_ca(dt.hour) + + if dt.minute > 0: + speak += " i " + pronounce_number_ca(dt.minute) + + else: + # speak number and add daytime identifier + # (equivalent to "in the morning") + if dt.hour == 0: + speak += "les dotze" + # 1 and 2 are pronounced in female form when talking about hours + elif dt.hour == 1 or dt.hour == 13: + speak += "la una" + elif dt.hour == 2 or dt.hour == 14: + speak += "les dues" + elif dt.hour < 13: + speak = "les " + pronounce_number_ca(dt.hour) + else: + speak = "les " + pronounce_number_ca(dt.hour - 12) + + # exact time + if dt.minute == 0: + # 3:00 + speak += " en punt" + # else + else: + speak += " i " + pronounce_number_ca(dt.minute) + + # TODO: review day-periods + if use_ampm: + if dt.hour == 0: + 
def convert_to_mixed_fraction(number, denominators=range(1, 21)):
    """
    Convert floats to components of a mixed fraction representation

    Returns the closest fractional representation using the
    provided denominators.  For example, 4.500002 would become
    the whole number 4, the numerator 1 and the denominator 2.

    A fractional part that rounds up to a full unit (for example 4.995,
    which is within the 0.01 tolerance of 1/1) is carried into the whole
    number, so the result is (5, 0, 1) rather than the improper (4, 1, 1).

    The sign is carried only by the whole part: the fractional part is
    always handled as a magnitude.  For inputs strictly between -1 and 0
    the whole part is 0, so the sign cannot be represented in the result
    (e.g. -0.5 yields (0, 1, 2)).

    Args:
        number (float): number for convert
        denominators (iter of ints): denominators to use, default [1 .. 20].
            An empty (falsy) value falls back to the default range.
    Returns:
        whole, numerator, denominator (int): Integers of the mixed fraction,
        or None when no denominator approximates the fractional part
        within 0.01.
    """
    int_number = int(number)
    if int_number == number:
        return int_number, 0, 1  # whole number, no fraction

    frac_number = abs(number - int_number)
    if not denominators:
        denominators = range(1, 21)

    # Take the first denominator whose scaled fraction is (almost) an integer.
    for denominator in denominators:
        numerator = frac_number * denominator
        if abs(numerator - round(numerator)) < 0.01:  # 0.01 accuracy
            break
    else:
        # Loop finished without a break: nothing was close enough.
        return None

    numerator = int(round(numerator))
    if numerator == denominator:
        # The fraction rounded up to a whole unit: carry it away from zero
        # instead of reporting something like "4 1/1".
        int_number += -1 if number < 0 else 1
        return int_number, 0, 1

    return int_number, numerator, denominator
20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. "1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_CS[den] + if whole == 0: + if num == 1: + return_string = '{}'.format(den_str) + else: + return_string = '{} {}'.format(num, den_str) + elif num == 1: + return_string = '{} a {}'.format(whole, den_str) + else: + return_string = '{} a {} {}'.format(whole, num, den_str) + if num > 4: + return_string = return_string[:-1] + elif num > 1: + return_string = return_string[:-1] + 'y' + + return return_string + + +def pronounce_number_cs(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + num = number + # deal with infinity + if num == float("inf"): + return "nekonečno" + elif num == float("-inf"): + return "záporné nekonečno" + if scientific: + number = '%E' % num + n, power = number.replace("+", "").split("E") + power = int(power) + if power != 0: + if ordinals: + # This handles zápornés of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} krát deset k {}{} mocnině'.format( + 'záporné ' if float(n) < 0 else '', + pronounce_number_cs( + 
abs(float(n)), places, short_scale, False, ordinals=False), + 'záporné ' if power < 0 else '', + pronounce_number_cs(abs(power), places, short_scale, False, ordinals=True)) + else: + # This handles zápornés of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} krát deset na mocninu {}{}'.format( + 'záporné ' if float(n) < 0 else '', + pronounce_number_cs( + abs(float(n)), places, short_scale, False), + 'záporné ' if power < 0 else '', + pronounce_number_cs(abs(power), places, short_scale, False)) + + if short_scale: + number_names = _NUM_STRING_CS.copy() + number_names.update(_SHORT_SCALE_CS) + else: + number_names = _NUM_STRING_CS.copy() + number_names.update(_LONG_SCALE_CS) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale: + hundreds = [_SHORT_SCALE_CS[n] for n in _SHORT_SCALE_CS.keys()] + else: + hundreds = [_LONG_SCALE_CS[n] for n in _LONG_SCALE_CS.keys()] + + # deal with zápornés + result = "" + if num < 0: + result = "záporné " if scientific else "mínus " + num = abs(num) + + if not ordinals: + try: + # deal with 4 digits + # usually if it's a 4 digit num it should be said like a date + # i.e. 1972 => nineteen seventy two + if len(str(num)) == 4 and isinstance(num, int): + _num = str(num) + # deal with 1000, 2000, 2001, 2100, 3123, etc + # is skipped as the rest of the + # functin deals with this already + if _num[1:4] == '000' or _num[1:3] == '00' or int(_num[0:2]) >= 20: + pass + # deal with 1900, 1300, etc + # i.e. 1900 => nineteen hundred + elif _num[2:4] == '00': + first = number_names[int(_num[0:2])] + last = number_names[100] + return first + " " + last + # deal with 1960, 1961, etc + # i.e. 
1960 => nineteen sixty + # 1961 => nineteen sixty one + else: + first = number_names[int(_num[0:2])] + if _num[3:4] == '0': + last = number_names[int(_num[2:4])] + else: + second = number_names[int(_num[2:3])*10] + last = second + " " + number_names[int(_num[3:4])] + return first + " " + last + # exception used to catch any unforseen edge cases + # will default back to normal subroutine + except Exception as e: + # TODO this probably shouldn't go to stdout + print('ERROR: Exception in pronounce_number_cs: {}' + repr(e)) + + # check for a direct match + if num in number_names and not ordinals: + if num > 90: + result += "jedna " + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False): + assert 0 <= n <= 999 + if n in _SHORT_ORDINAL_CS and ordinals: + return _SHORT_ORDINAL_CS[n] + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r + else "") + else: + q, r = divmod(n, 100) + return digits[q] + " sto" + ( + " a " + _sub_thousand(r, ordinals) if r else "") + + def _short_scale(n): + if n >= max(_SHORT_SCALE_CS.keys()): + return "nekonečno" + ordi = ordinals + + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z, not i and ordi) + + if i: + if i >= len(hundreds): + return "" + number += " " + if ordi: + + if i * 1000 in _SHORT_ORDINAL_CS: + if z == 1: + number = _SHORT_ORDINAL_CS[i * 1000] + else: + number += _SHORT_ORDINAL_CS[i * 1000] + else: + if n not in _SHORT_SCALE_CS: + num = int("1" + "0" * (len(str(n)) - 2)) + + number += _SHORT_SCALE_CS[num] + "tý" + else: + number = _SHORT_SCALE_CS[n] + "tý" + else: + number += hundreds[i] + res.append(number) + ordi = False + + return ", ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if 
def nice_time_cs(dt, speech=True, use_24hour=True, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate a spoken phrase for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    # Build the display form first; the 24-hour speech path reads its digits.
    if use_24hour:
        # e.g. "03:01" or "14:22"
        clock = dt.strftime("%H:%M")
    else:
        # e.g. "3:01 AM" / "2:22 PM" with am/pm, otherwise "3:01" / "2:22"
        clock = dt.strftime("%I:%M %p" if use_ampm else "%I:%M")
        if clock[0] == '0':
            clock = clock[1:]  # strip leading zeros

    if not speech:
        return clock

    if use_24hour:
        hour_digits = clock[0:2]
        minute_digits = clock[3:5]

        # Hours with a leading zero are spoken digit by digit.
        if hour_digits[0] == '0':
            spoken = pronounce_number_cs(int(hour_digits[0])) + " "
            spoken += pronounce_number_cs(int(hour_digits[1]))
        else:
            spoken = pronounce_number_cs(int(hour_digits))

        spoken += " "
        if minute_digits == '00':
            return spoken + "sto"
        if minute_digits[0] == '0':
            # Single-digit minutes: speak the zero, then the digit.
            spoken += pronounce_number_cs(0) + " "
            return spoken + pronounce_number_cs(int(minute_digits[1]))
        return spoken + pronounce_number_cs(int(minute_digits))

    # 12-hour speech: midnight and noon have dedicated words.
    if dt.minute == 0 and dt.hour == 0:
        return "půlnoc"
    if dt.minute == 0 and dt.hour == 12:
        return "poledne"

    hour12 = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    if dt.minute == 15:
        spoken = "čtvrt po " + pronounce_number_cs(hour12)
    elif dt.minute == 30:
        spoken = "půl po " + pronounce_number_cs(hour12)
    elif dt.minute == 45:
        # Quarter-to phrasing refers to the upcoming hour.
        upcoming = (dt.hour + 1) % 12 or 12
        spoken = "třičtvrtě na " + pronounce_number_cs(upcoming)
    elif dt.minute == 0:
        spoken = pronounce_number_cs(hour12)
        if not use_ampm:
            return spoken + " hodin"
    else:
        spoken = pronounce_number_cs(hour12)
        if dt.minute < 10:
            spoken += " oh"
        spoken += " " + pronounce_number_cs(dt.minute)

    if use_ampm:
        spoken += " p.m." if dt.hour > 11 else " a.m."

    return spoken
"1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_DA[den] + if whole == 0: + if num == 1: + return_string = '{} {}'.format(num, den_str) + else: + return_string = '{} {}e'.format(num, den_str) + else: + if num == 1: + return_string = '{} og {} {}'.format(whole, num, den_str) + else: + return_string = '{} og {} {}e'.format(whole, num, den_str) + + return return_string + + +def pronounce_number_da(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + number(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + # TODO short_scale, scientific and ordinals + # currently ignored + + def pronounce_triplet_da(num): + result = "" + num = floor(num) + if num > 99: + hundreds = floor(num / 100) + if hundreds > 0: + if hundreds == 1: + result += 'et' + 'hundrede' + _EXTRA_SPACE_DA + else: + result += _NUM_STRING_DA[hundreds] + \ + 'hundrede' + _EXTRA_SPACE_DA + num -= hundreds * 100 + if num == 0: + result += '' # do nothing + elif num == 1: + result += 'et' + elif num <= 20: + result += _NUM_STRING_DA[num] + _EXTRA_SPACE_DA + elif num > 20: + ones = num % 10 + tens = num - ones + if ones > 0: + result += _NUM_STRING_DA[ones] + _EXTRA_SPACE_DA + if tens > 0: + result += 'og' + _EXTRA_SPACE_DA + if tens > 0: + result += _NUM_STRING_DA[tens] + _EXTRA_SPACE_DA + + return result + + def pronounce_fractional_da(num, places): + # fixed number of places even with trailing zeros + result = "" + place = 10 + while places > 0: + 
def pronounce_ordinal_da(number):
    """
    This function pronounces a number as an ordinal

    1 -> first
    2 -> second

    Whole-valued floats (e.g. 3.0) are accepted and treated as the
    corresponding integer.

    Args:
        number (int): the number to format
    Returns:
        (str): The pronounced number string.  Negative or non-whole input
        is returned unchanged (not converted to a string).
    """

    # Irregular ordinals are looked up directly.
    # This produces the base form, it will have to be adapted for genus,
    # casus, numerus
    ordinals = ["nulte", "første", "anden", "tredie", "fjerde", "femte",
                "sjette", "syvende", "ottende", "niende", "tiende"]

    # only for whole positive numbers including zero
    if number < 0 or number != int(number):
        return number

    # Normalise whole floats so the value can be used as a list index.
    number = int(number)
    if number < len(ordinals):
        return ordinals[number]

    spoken = pronounce_number_da(number)
    if 30 <= number < 40:
        return spoken + "fte"
    # Avoid doubling the vowel when the cardinal already ends in "e".
    if spoken[-1:] == 'e':
        return spoken + "nde"
    return spoken + "ende"
def nice_response_da(text):
    """
    Post-process a Danish response text before it is spoken.

    Two rewrites are applied:
      * a standalone "^" followed by a numeric word is replaced with
        "opløftet i" (to the power of);
      * if the text contains a month name, it is passed through
        _nice_ordinal_da for declension of ordinals.

    The "^" replacement is applied first and the ordinal pass last, so that
    neither rewrite discards the result of the other regardless of where
    the month and the "^" appear in the text.

    Args:
        text (str): the response text
    Returns:
        (str): the rewritten text; returned unchanged when neither rewrite
        applies.
    """
    words = text.split()
    has_month = False
    caret_replaced = False

    for idx, word in enumerate(words):
        if word.lower() in _MONTHS_DA:
            has_month = True

        if word == '^':
            wordNext = words[idx + 1] if idx + 1 < len(words) else ""
            if wordNext.isnumeric():
                words[idx] = "opløftet i"
                caret_replaced = True

    # Only rebuild the text when something changed, so untouched input keeps
    # its original whitespace.
    if caret_replaced:
        text = " ".join(words)
    if has_month:
        text = _nice_ordinal_da(text)
    return text
def nice_number_de(number, speech=True, denominators=range(1, 21)):
    """ German helper for nice_number

    Formats a float in a human friendly way: 4.5 becomes "4 und ein
    <half>" style wording for speech and "4 1/2" for text, with the
    fraction word taken from _FRACTION_STRING_DE.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    mixed = convert_to_mixed_fraction(number, denominators)
    if not mixed:
        # No usable fraction: fall back to a 3 decimal number, written with
        # a decimal comma.
        return str(round(number, 3)).replace(".", ",")

    whole, num, den = mixed

    # Whole numbers look the same for speech and display.
    # TODO: Number grouping?  E.g. "1,000,000"
    if num == 0:
        return str(whole)

    if not speech:
        return '{} {}/{}'.format(whole, num, den)

    den_str = _FRACTION_STRING_DE[den]
    if whole == 0 and num == 1:
        return 'ein {}'.format(den_str)
    if whole == 0:
        return '{} {}'.format(num, den_str)
    if num == 1:
        return '{} und ein {}'.format(whole, den_str)
    return '{} und {} {}'.format(whole, num, den_str)
1: + result += 'eins' # need the s for the last digit + elif num <= 20: + result += _NUM_STRING_DE[num] # + _EXTRA_SPACE_DA + elif num > 20: + ones = num % 10 + tens = num - ones + if ones > 0: + result += _NUM_STRING_DE[ones] + _EXTRA_SPACE_DE + if tens > 0: + result += 'und' + _EXTRA_SPACE_DE + if tens > 0: + result += _NUM_STRING_DE[tens] + _EXTRA_SPACE_DE + return result + + def pronounce_fractional_de(num, + places): # fixed number of places even with + # trailing zeros + result = "" + place = 10 + while places > 0: # doesn't work with 1.0001 and places = 2: int( + # number*place) % 10 > 0 and places > 0: + result += " " + _NUM_STRING_DE[int(num * place) % 10] + if int(num * place) % 10 == 1: + result += 's' # "1" is pronounced "eins" after the decimal + # point + place *= 10 + places -= 1 + return result + + def pronounce_whole_number_de(num, scale_level=0): + if num == 0: + return '' + + num = floor(num) + result = '' + last_triplet = num % 1000 + + if last_triplet == 1: + if scale_level == 0: + if result != '': + result += '' + 'eins' + else: + result += "eins" + elif scale_level == 1: + result += 'ein' + _EXTRA_SPACE_DE + 'tausend' + _EXTRA_SPACE_DE + else: + result += "eine " + _NUM_POWERS_OF_TEN_DE[scale_level] + ' ' + elif last_triplet > 1: + result += pronounce_triplet_de(last_triplet) + if scale_level == 1: + # result += _EXTRA_SPACE_DA + result += 'tausend' + _EXTRA_SPACE_DE + if scale_level >= 2: + # if _EXTRA_SPACE_DA == '': + # result += " " + result += " " + _NUM_POWERS_OF_TEN_DE[scale_level] + if scale_level >= 2: + if scale_level % 2 == 0: + result += "e" # MillionE + result += "n " # MilliardeN, MillioneN + + num = floor(num / 1000) + scale_level += 1 + return pronounce_whole_number_de(num, + scale_level) + result # + _EXTRA_SPACE_DA + + result = "" + if abs(number) >= 1000000000000000000000000: # cannot do more than this + return str(number) + elif number == 0: + return str(_NUM_STRING_DE[0]) + elif number < 0: + return "minus " + 
def pronounce_ordinal_de(number):
    """
    This function pronounces a number as an ordinal

    1 -> first
    2 -> second

    Whole-valued floats (e.g. 3.0) are accepted and treated as the
    corresponding integer.

    Args:
        number (int): the number to format
    Returns:
        (str): The pronounced number string.  Negative or non-whole input
        is returned unchanged (not converted to a string).
    """
    # ordinals for 1, 3, 7 and 8 are irregular
    # this produces the base form, it will have to be adapted for genus,
    # casus, numerus

    ordinals = ["nullte", "erste", "zweite", "dritte", "vierte", "fünfte",
                "sechste", "siebte", "achte"]

    # only for whole positive numbers including zero
    if number < 0 or number != int(number):
        return number

    # Normalise whole floats so the value can be used as a list index.
    number = int(number)
    if number < len(ordinals):
        return ordinals[number]
    if number < 20:
        return pronounce_number_de(number) + "te"
    return pronounce_number_de(number) + "ste"
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + if dt.hour == 1: + speak += "ein" # 01:00 is "ein Uhr" not "eins Uhr" + else: + speak += pronounce_number_de(dt.hour) + speak += " Uhr" + if not dt.minute == 0: # zero minutes are not pronounced, 13:00 is + # "13 Uhr" not "13 hundred hours" + speak += " " + pronounce_number_de(dt.minute) + + return speak # ampm is ignored when use_24hour is true + else: + if dt.hour == 0 and dt.minute == 0: + return "Mitternacht" + elif dt.hour == 12 and dt.minute == 0: + return "Mittag" + elif dt.minute == 15: + # sentence relative to next hour and 0 spoken as 12 + next_hour = (dt.hour + 1) % 12 or 12 + speak = "viertel " + pronounce_number_de(next_hour) + elif dt.minute == 30: + next_hour = (dt.hour + 1) % 12 or 12 + speak = "halb " + pronounce_number_de(next_hour) + elif dt.minute == 45: + next_hour = (dt.hour + 1) % 12 or 12 + speak = "dreiviertel " + pronounce_number_de(next_hour) + else: + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 + if hour == 1: # 01:00 and 13:00 is "ein Uhr" not "eins Uhr" + speak += 'ein' + else: + speak += pronounce_number_de(hour) + speak += " Uhr" + + if not dt.minute == 0: + speak += " " + pronounce_number_de(dt.minute) + + if use_ampm: + if 3 <= dt.hour < 12: + speak += " morgens" # 03:00 - 11:59 morgens/in the morning + elif 12 <= dt.hour < 18: + speak += " nachmittags" # 12:01 - 17:59 nachmittags/afternoon + elif 18 <= dt.hour < 22: + speak += " abends" # 18:00 - 21:59 abends/evening + else: + speak += " nachts" # 22:00 - 02:59 nachts/at night + + return speak + + +def nice_response_de(text): + # check for months and call _nice_ordinal_de declension of ordinals + # replace "^" with "hoch" (to the power of) + words = text.split() + + for idx, word in enumerate(words): + if word.lower() in _MONTHS_DE: + text = 
_nice_ordinal_de(text) + + if word == '^': + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + if wordNext.isnumeric(): + words[idx] = "hoch" + text = " ".join(words) + return text + + +def _nice_ordinal_de(text, speech=True): + # check for months for declension of ordinals before months + # depending on articles/prepositions + normalized_text = text + words = text.split() + + for idx, word in enumerate(words): + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordPrev = words[idx - 1] if idx > 0 else "" + if word[-1:] == ".": + if word[:-1].isdecimal(): + if wordNext.lower() in _MONTHS_DE: + word = pronounce_ordinal_de(int(word[:-1])) + if wordPrev.lower() in ["am", "dem", "vom", "zum", + "(vom", "(am", "zum"]: + word += "n" + elif wordPrev.lower() not in ["der", "die", "das"]: + word += "r" + words[idx] = word + normalized_text = " ".join(words) + return normalized_text diff --git a/lingua_franca/lang/format_en.py b/lingua_franca/lang/format_en.py new file mode 100644 index 0000000..c2911fe --- /dev/null +++ b/lingua_franca/lang/format_en.py @@ -0,0 +1,386 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_en import _NUM_STRING_EN, \ + _FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN, _LONG_ORDINAL_EN + + +def nice_number_en(number, speech=True, denominators=range(1, 21)): + """ English helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 and a half" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. "1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_EN[den] + if whole == 0: + if num == 1: + return_string = 'a {}'.format(den_str) + else: + return_string = '{} {}'.format(num, den_str) + elif num == 1: + return_string = '{} and a {}'.format(whole, den_str) + else: + return_string = '{} and {} {}'.format(whole, num, den_str) + if num > 1: + return_string += 's' + return return_string + + +def pronounce_number_en(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" 
instead of "one" + Returns: + (str): The pronounced number + """ + num = number + # deal with infinity + if num == float("inf"): + return "infinity" + elif num == float("-inf"): + return "negative infinity" + if scientific: + number = '%E' % num + n, power = number.replace("+", "").split("E") + power = int(power) + if power != 0: + if ordinals: + # This handles negatives of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} times ten to the {}{} power'.format( + 'negative ' if float(n) < 0 else '', + pronounce_number_en( + abs(float(n)), places, short_scale, False, ordinals=False), + 'negative ' if power < 0 else '', + pronounce_number_en(abs(power), places, short_scale, False, ordinals=True)) + else: + # This handles negatives of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} times ten to the power of {}{}'.format( + 'negative ' if float(n) < 0 else '', + pronounce_number_en( + abs(float(n)), places, short_scale, False), + 'negative ' if power < 0 else '', + pronounce_number_en(abs(power), places, short_scale, False)) + + if short_scale: + number_names = _NUM_STRING_EN.copy() + number_names.update(_SHORT_SCALE_EN) + else: + number_names = _NUM_STRING_EN.copy() + number_names.update(_LONG_SCALE_EN) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale: + hundreds = [_SHORT_SCALE_EN[n] for n in _SHORT_SCALE_EN.keys()] + else: + hundreds = [_LONG_SCALE_EN[n] for n in _LONG_SCALE_EN.keys()] + + # deal with negatives + result = "" + if num < 0: + result = "negative " if scientific else "minus " + num = abs(num) + + if not ordinals: + try: + # deal with 4 digits + # usually if it's a 4 digit num it should be said like a date + # i.e. 
1972 => nineteen seventy two + if len(str(num)) == 4 and isinstance(num, int): + _num = str(num) + # deal with 1000, 2000, 2001, 2100, 3123, etc + # is skipped as the rest of the + # functin deals with this already + if _num[1:4] == '000' or _num[1:3] == '00' or int(_num[0:2]) >= 20: + pass + # deal with 1900, 1300, etc + # i.e. 1900 => nineteen hundred + elif _num[2:4] == '00': + first = number_names[int(_num[0:2])] + last = number_names[100] + return first + " " + last + # deal with 1960, 1961, etc + # i.e. 1960 => nineteen sixty + # 1961 => nineteen sixty one + else: + first = number_names[int(_num[0:2])] + if _num[3:4] == '0': + last = number_names[int(_num[2:4])] + else: + second = number_names[int(_num[2:3])*10] + last = second + " " + number_names[int(_num[3:4])] + return first + " " + last + # exception used to catch any unforseen edge cases + # will default back to normal subroutine + except Exception as e: + # TODO this probably shouldn't go to stdout + print('ERROR: Exception in pronounce_number_en: {}' + repr(e)) + + # check for a direct match + if num in number_names and not ordinals: + if num > 90: + result += "one " + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False): + assert 0 <= n <= 999 + if n in _SHORT_ORDINAL_EN and ordinals: + return _SHORT_ORDINAL_EN[n] + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r + else "") + else: + q, r = divmod(n, 100) + return digits[q] + " hundred" + ( + " and " + _sub_thousand(r, ordinals) if r else "") + + def _short_scale(n): + if n >= max(_SHORT_SCALE_EN.keys()): + return "infinity" + ordi = ordinals + + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z, not i and ordi) + + if i: + if i >= len(hundreds): + return "" + number += " " + if ordi: + + if i * 1000 in _SHORT_ORDINAL_EN: + if z == 
1: + number = _SHORT_ORDINAL_EN[i * 1000] + else: + number += _SHORT_ORDINAL_EN[i * 1000] + else: + if n not in _SHORT_SCALE_EN: + num = int("1" + "0" * (len(str(n)) - 2)) + + number += _SHORT_SCALE_EN[num] + "th" + else: + number = _SHORT_SCALE_EN[n] + "th" + else: + number += hundreds[i] + res.append(number) + ordi = False + + return ", ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_EN.keys()): + return "infinity" + ordi = ordinals + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000000)): + if not z: + continue + number = pronounce_number_en(z, places, True, scientific, + ordinals=ordi and not i) + # strip off the comma after the thousand + if i: + if i >= len(hundreds): + return "" + # plus one as we skip 'thousand' + # (and 'hundred', but this is excluded by index value) + number = number.replace(',', '') + + if ordi: + if i * 1000000 in _LONG_ORDINAL_EN: + if z == 1: + number = _LONG_ORDINAL_EN[ + (i + 1) * 1000000] + else: + number += _LONG_ORDINAL_EN[ + (i + 1) * 1000000] + else: + if n not in _LONG_SCALE_EN: + num = int("1" + "0" * (len(str(n)) - 2)) + + number += " " + _LONG_SCALE_EN[ + num] + "th" + else: + number = " " + _LONG_SCALE_EN[n] + "th" + else: + + number += " " + hundreds[i + 1] + res.append(number) + return ", ".join(reversed(res)) + + if short_scale: + result += _short_scale(num) + else: + result += _long_scale(num) + + # deal with scientific notation unpronounceable as number + if not result and "e" in str(num): + return pronounce_number_en(num, places, short_scale, scientific=True) + # Deal with fractional part + elif not num == int(num) and places > 0: + if abs(num) < 1.0 and (result == "minus " or not result): + result += "zero" + result += " point" + _num_str = str(num) + _num_str = _num_str.split(".")[1][0:places] + for char 
in _num_str: + result += " " + number_names[int(char)] + return result + + +def nice_time_en(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'five thirty' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + + # Either "0 8 hundred" or "13 hundred" + if string[0] == '0': + speak += pronounce_number_en(int(string[0])) + " " + speak += pronounce_number_en(int(string[1])) + else: + speak = pronounce_number_en(int(string[0:2])) + + speak += " " + if string[3:5] == '00': + speak += "hundred" + else: + if string[3] == '0': + speak += pronounce_number_en(0) + " " + speak += pronounce_number_en(int(string[4])) + else: + speak += pronounce_number_en(int(string[3:5])) + return speak + else: + if dt.hour == 0 and dt.minute == 0: + return "midnight" + elif dt.hour == 12 and dt.minute == 0: + return "noon" + + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 + if dt.minute == 15: + speak = "quarter past " + pronounce_number_en(hour) + elif dt.minute == 30: + speak = "half past " + pronounce_number_en(hour) + elif dt.minute == 45: + next_hour = (dt.hour + 1) % 12 or 12 + speak = "quarter to " + pronounce_number_en(next_hour) + else: + speak = pronounce_number_en(hour) + + if 
dt.minute == 0: + if not use_ampm: + return speak + " o'clock" + else: + if dt.minute < 10: + speak += " oh" + speak += " " + pronounce_number_en(dt.minute) + + if use_ampm: + if dt.hour > 11: + speak += " p.m." + else: + speak += " a.m." + + return speak diff --git a/lingua_franca/lang/format_es.py b/lingua_franca/lang/format_es.py new file mode 100644 index 0000000..a224af5 --- /dev/null +++ b/lingua_franca/lang/format_es.py @@ -0,0 +1,269 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Format functions for castillian (es-es) + +""" +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_es import _NUM_STRING_ES, \ + _FRACTION_STRING_ES + + +def nice_number_es(number, speech=True, denominators=range(1, 21)): + """ Spanish helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 y medio" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. 
+ """ + strNumber = "" + whole = 0 + num = 0 + den = 0 + + result = convert_to_mixed_fraction(number, denominators) + + if not result: + # Give up, just represent as a 3 decimal number + whole = round(number, 3) + else: + whole, num, den = result + + if not speech: + if num == 0: + strNumber = '{:,}'.format(whole) + strNumber = strNumber.replace(",", " ") + strNumber = strNumber.replace(".", ",") + return strNumber + else: + return '{} {}/{}'.format(whole, num, den) + else: + if num == 0: + # if the number is not a fraction, nothing to do + strNumber = str(whole) + strNumber = strNumber.replace(".", ",") + return strNumber + den_str = _FRACTION_STRING_ES[den] + # if it is not an integer + if whole == 0: + # if there is no whole number + if num == 1: + # if numerator is 1, return "un medio", for example + strNumber = 'un {}'.format(den_str) + else: + # else return "cuatro tercios", for example + strNumber = '{} {}'.format(num, den_str) + elif num == 1: + # if there is a whole number and numerator is 1 + if den == 2: + # if denominator is 2, return "1 y medio", for example + strNumber = '{} y {}'.format(whole, den_str) + else: + # else return "1 y 1 tercio", for example + strNumber = '{} y 1 {}'.format(whole, den_str) + else: + # else return "2 y 3 cuarto", for example + strNumber = '{} y {} {}'.format(whole, num, den_str) + if num > 1 and den != 3: + # if the numerator is greater than 1 and the denominator + # is not 3 ("tercio"), add an s for plural + strNumber += 's' + + return strNumber + + +def pronounce_number_es(number, places=2): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'cinco coma dos' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + Returns: + (str): The pronounced number + """ + if abs(number) >= 100: + # TODO: Soporta a números por encima de 100 + return str(number) + + result = "" + if number < 0: + result = "menos " + number = abs(number) + + 
# del 21 al 29 tienen una pronunciación especial + if 20 <= number <= 29: + tens = int(number-int(number) % 10) + ones = int(number - tens) + result += _NUM_STRING_ES[tens] + if ones > 0: + result = result[:-1] + # a veinte le quitamos la "e" final para construir los + # números del 21 - 29. Pero primero tenemos en cuenta + # las excepciones: 22, 23 y 26, que llevan tilde. + if ones == 2: + result += "idós" + elif ones == 3: + result += "itrés" + elif ones == 6: + result += "iséis" + else: + result += "i" + _NUM_STRING_ES[ones] + elif number >= 30: # de 30 en adelante + tens = int(number-int(number) % 10) + ones = int(number - tens) + result += _NUM_STRING_ES[tens] + if ones > 0: + result += " y " + _NUM_STRING_ES[ones] + else: + result += _NUM_STRING_ES[int(number)] + + # Deal with decimal part, in spanish is commonly used the comma + # instead the dot. Decimal part can be written both with comma + # and dot, but when pronounced, its pronounced "coma" + if not number == int(number) and places > 0: + if abs(number) < 1.0 and (result == "menos " or not result): + result += "cero" + result += " coma" + _num_str = str(number) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + _NUM_STRING_ES[int(char)] + return result + + +def nice_time_es(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'cinco treinta' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + # Tenemos que tener en cuenta que cuando hablamos en formato + # 24h, no hay que especificar ninguna precisión adicional + # como "la noche", "la tarde" o "la mañana" + # http://lema.rae.es/dpd/srv/search?id=YNoTWNJnAD6bhhVBf9 + if dt.hour == 1: + speak += "la una" + else: + speak += "las " + pronounce_number_es(dt.hour) + + # las 14:04 son "las catorce cero cuatro" + if dt.minute < 10: + speak += " cero " + pronounce_number_es(dt.minute) + else: + speak += " " + pronounce_number_es(dt.minute) + + else: + # Prepare for "tres menos cuarto" ?? + if dt.minute == 35: + minute = -25 + hour = dt.hour + 1 + elif dt.minute == 40: + minute = -20 + hour = dt.hour + 1 + elif dt.minute == 45: + minute = -15 + hour = dt.hour + 1 + elif dt.minute == 50: + minute = -10 + hour = dt.hour + 1 + elif dt.minute == 55: + minute = -5 + hour = dt.hour + 1 + else: + minute = dt.minute + hour = dt.hour + + if hour == 0 or hour == 12: + speak += "las doce" + elif hour == 1 or hour == 13: + speak += "la una" + elif hour < 13: + speak = "las " + pronounce_number_es(hour) + else: + speak = "las " + pronounce_number_es(hour-12) + + if minute != 0: + # las horas especiales + if minute == 15: + speak += " y cuarto" + elif minute == 30: + speak += " y media" + elif minute == -15: + speak += " menos cuarto" + else: # seis y nueve. 
siete y veinticinco + if minute > 0: + speak += " y " + pronounce_number_es(minute) + else: # si son las siete menos veinte, no ponemos la "y" + speak += " " + pronounce_number_es(minute) + + # si no especificamos de la tarde, noche, mañana, etc + if minute == 0 and not use_ampm: + # 3:00 + speak += " en punto" + + if use_ampm: + # "de la noche" es desde que anochece hasta medianoche + # así que decir que es desde las 21h es algo subjetivo + # en España a las 20h se dice "de la tarde" + # en castellano, las 12h es de la mañana o mediodía + # así que diremos "de la tarde" a partir de las 13h. + # http://lema.rae.es/dpd/srv/search?id=YNoTWNJnAD6bhhVBf9 + if hour >= 0 and hour < 6: + speak += " de la madrugada" + elif hour >= 6 and hour < 13: + speak += " de la mañana" + elif hour >= 13 and hour < 21: + speak += " de la tarde" + else: + speak += " de la noche" + return speak diff --git a/lingua_franca/lang/format_fa.py b/lingua_franca/lang/format_fa.py new file mode 100644 index 0000000..04a4275 --- /dev/null +++ b/lingua_franca/lang/format_fa.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_fa import \ + _FARSI_ONES, _FARSI_TENS, _FARSI_HUNDREDS, _FARSI_BIG, _FARSI_SEPERATOR, \ + _FARSI_FRAC, _FARSI_FRAC_BIG, _FRACTION_STRING_FA, _FORMAL_VARIANT +import math +from lingua_franca.internal import lookup_variant +from enum import IntEnum +from functools import wraps + +class NumberVariantFA(IntEnum): + CONVERSATIONAL = 0 + FORMAL = 1 + +lookup_number = lookup_variant({ + "default": NumberVariantFA.CONVERSATIONAL, + "conversational": NumberVariantFA.CONVERSATIONAL, + "formal": NumberVariantFA.FORMAL, +}) + +def _apply_number_variant(text, variant): + if variant == NumberVariantFA.FORMAL: + for key, value in _FORMAL_VARIANT.items(): + text = text.replace(value, key) + return text + +def _handle_number_variant(func): + + @wraps(func) + @lookup_variant({ + "default": NumberVariantFA.CONVERSATIONAL, + "conversational": NumberVariantFA.CONVERSATIONAL, + "formal": NumberVariantFA.FORMAL, + }) + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + if 'variant' in kwargs: + return _apply_number_variant(result, kwargs['variant']) + else: + return result + return wrapper + +@_handle_number_variant +def nice_number_fa(number, speech=True, denominators=range(1, 21), variant=None): + """ Farsi helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 and a half" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. 
"1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_FA[den] + if whole == 0: + if num == 1: + return_string = 'یک {}'.format(den_str) + else: + return_string = '{} {}'.format(num, den_str) + elif num == 1: + return_string = '{} و یک {}'.format(whole, den_str) + else: + return_string = '{} و {} {}'.format(whole, num, den_str) + return return_string + + +def _float2tuple(value, _precision): + pre = int(value) + + post = abs(value - pre) * 10**_precision + if abs(round(post) - post) < 0.01: + # We generally floor all values beyond our precision (rather than + # rounding), but in cases where we have something like 1.239999999, + # which is probably due to python's handling of floats, we actually + # want to consider it as 1.24 instead of 1.23 + post = int(round(post)) + else: + post = int(math.floor(post)) + + while post != 0: + x, y = divmod(post, 10) + if y != 0: + break + post = x + _precision -= 1 + + return pre, post, _precision + + +def _cardinal3(number): + if (number < 19): + return _FARSI_ONES[number] + if (number < 100): + x, y = divmod(number, 10) + if y == 0: + return _FARSI_TENS[x] + return _FARSI_TENS[x] + _FARSI_SEPERATOR + _FARSI_ONES[y] + x, y = divmod(number, 100) + if y == 0: + return _FARSI_HUNDREDS[x] + return _FARSI_HUNDREDS[x] + _FARSI_SEPERATOR + _cardinal3(y) + +def _cardinalPos(number): + x = number + res = '' + for b in _FARSI_BIG: + x, y = divmod(x, 1000) + if (y == 0): + continue + yx = _cardinal3(y) + if y == 1 and b == 'هزار': + yx = b + elif b != '': + yx += ' ' + b + if (res == ''): + res = yx + else: + res = yx + _FARSI_SEPERATOR + res + return res + +def _fractional(number, l): + if (number / 10**l == 0.5): + return "نیم" + x = _cardinalPos(number) + ld3, lm3 = divmod(l, 3) + ltext = (_FARSI_FRAC[lm3] + " " + _FARSI_FRAC_BIG[ld3]).strip() + 'م' + return x + " " + ltext + +def _to_ordinal(number): + r = _to_cardinal(number, 0) + if (r[-1] 
== 'ه' and r[-2] == 'س'): + return r[:-1] + 'وم' + return r + 'م' + +def _to_ordinal_num(value): + return str(value)+"م" + +def _to_cardinal(number, places): + if number < 0: + return "منفی " + _to_cardinal(-number, places) + if (number == 0): + return "صفر" + x, y, l = _float2tuple(number, places) + if y == 0: + return _cardinalPos(x) + if x == 0: + return _fractional(y, l) + return _cardinalPos(x) + _FARSI_SEPERATOR + _fractional(y, l) + +@_handle_number_variant +def pronounce_number_fa(number, places=2, scientific=False, + ordinals=False, variant=None): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + num = number + # deal with infinity + if num == float("inf"): + return "بینهایت" + elif num == float("-inf"): + return "منفی بینهایت" + if scientific: + if number == 0: + return "صفر" + number = '%E' % num + n, power = number.replace("+", "").split("E") + power = int(power) + if power != 0: + return '{}{} ضرب در ده به توان {}{}'.format( + 'منفی ' if float(n) < 0 else '', + pronounce_number_fa( + abs(float(n)), places, False, ordinals=False), + 'منفی ' if power < 0 else '', + pronounce_number_fa(abs(power), places, False, ordinals=False)) + if ordinals: + return _to_ordinal(number) + return _to_cardinal(number, places) + +@_handle_number_variant +def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False, variant=None): + """ + Format a time to a comfortable human format + For example, generate 'five thirty' for speech or '5:30' for + text display. 
+ Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + + # Either "0 8 hundred" or "13 hundred" + if string[0] == '0': + speak += pronounce_number_fa(int(string[1])) + else: + speak = pronounce_number_fa(int(string[0:2])) + if not string[3:5] == '00': + speak += " و " + if string[3] == '0': + speak += pronounce_number_fa(int(string[4])) + else: + speak += pronounce_number_fa(int(string[3:5])) + speak += ' دقیقه' + return speak + else: + if dt.hour == 0 and dt.minute == 0: + return "نیمه شب" + elif dt.hour == 12 and dt.minute == 0: + return "ظهر" + + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 + if dt.minute == 15: + speak = pronounce_number_fa(hour) + " و ربع" + elif dt.minute == 30: + speak = pronounce_number_fa(hour) + " و نیم" + elif dt.minute == 45: + next_hour = (dt.hour + 1) % 12 or 12 + speak = "یه ربع به " + pronounce_number_fa(next_hour) + else: + speak = pronounce_number_fa(hour) + + if dt.minute == 0: + if not use_ampm: + return speak + else: + speak += " و " + pronounce_number_fa(dt.minute) + ' دقیقه' + + if use_ampm: + if dt.hour > 11: + speak += " بعد از ظهر" + else: + speak += " قبل از ظهر" + + return speak diff --git a/lingua_franca/lang/format_fr.py b/lingua_franca/lang/format_fr.py new file mode 100644 index 0000000..5eea39c --- /dev/null +++ 
b/lingua_franca/lang/format_fr.py @@ -0,0 +1,251 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_fr import _NUM_STRING_FR, \ + _FRACTION_STRING_FR + + +def nice_number_fr(number, speech=True, denominators=range(1, 21)): + """ French helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 et demi" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. 
+ """ + strNumber = "" + whole = 0 + num = 0 + den = 0 + + result = convert_to_mixed_fraction(number, denominators) + + if not result: + # Give up, just represent as a 3 decimal number + whole = round(number, 3) + else: + whole, num, den = result + + if not speech: + if num == 0: + strNumber = '{:,}'.format(whole) + strNumber = strNumber.replace(",", " ") + strNumber = strNumber.replace(".", ",") + return strNumber + else: + return '{} {}/{}'.format(whole, num, den) + else: + if num == 0: + # if the number is not a fraction, nothing to do + strNumber = str(whole) + strNumber = strNumber.replace(".", ",") + return strNumber + den_str = _FRACTION_STRING_FR[den] + # if it is not an integer + if whole == 0: + # if there is no whole number + if num == 1: + # if numerator is 1, return "un demi", for example + strNumber = 'un {}'.format(den_str) + else: + # else return "quatre tiers", for example + strNumber = '{} {}'.format(num, den_str) + elif num == 1: + # if there is a whole number and numerator is 1 + if den == 2: + # if denominator is 2, return "1 et demi", for example + strNumber = '{} et {}'.format(whole, den_str) + else: + # else return "1 et 1 tiers", for example + strNumber = '{} et 1 {}'.format(whole, den_str) + else: + # else return "2 et 3 quart", for example + strNumber = '{} et {} {}'.format(whole, num, den_str) + if num > 1 and den != 3: + # if the numerator is greater than 1 and the denominator + # is not 3 ("tiers"), add an s for plural + strNumber += 's' + + return strNumber + + +def pronounce_number_fr(number, places=2): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'cinq virgule deux' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + Returns: + (str): The pronounced number + """ + if abs(number) >= 100: + # TODO: Support for numbers over 100 + return str(number) + + result = "" + if number < 0: + result = "moins " + number = abs(number) + + if 
number > 16: + tens = int(number-int(number) % 10) + ones = int(number-tens) + if ones != 0: + if tens > 10 and tens <= 60 and int(number-tens) == 1: + result += _NUM_STRING_FR[tens] + "-et-" + _NUM_STRING_FR[ones] + elif number == 71: + result += "soixante-et-onze" + elif tens == 70: + result += _NUM_STRING_FR[60] + "-" + if ones < 7: + result += _NUM_STRING_FR[10 + ones] + else: + result += _NUM_STRING_FR[10] + "-" + _NUM_STRING_FR[ones] + elif tens == 90: + result += _NUM_STRING_FR[80] + "-" + if ones < 7: + result += _NUM_STRING_FR[10 + ones] + else: + result += _NUM_STRING_FR[10] + "-" + _NUM_STRING_FR[ones] + else: + result += _NUM_STRING_FR[tens] + "-" + _NUM_STRING_FR[ones] + else: + if number == 80: + result += "quatre-vingts" + else: + result += _NUM_STRING_FR[tens] + else: + result += _NUM_STRING_FR[int(number)] + + # Deal with decimal part + if not number == int(number) and places > 0: + if abs(number) < 1.0 and (result == "moins " or not result): + result += "zéro" + result += " virgule" + _num_str = str(number) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + _NUM_STRING_FR[int(char)] + return result + + +def nice_time_fr(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'cinq heures trente' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + + # "13 heures trente" + if dt.hour == 0: + speak += "minuit" + elif dt.hour == 12: + speak += "midi" + elif dt.hour == 1: + speak += "une heure" + else: + speak += pronounce_number_fr(dt.hour) + " heures" + + if dt.minute != 0: + speak += " " + pronounce_number_fr(dt.minute) + + else: + # Prepare for "trois heures moins le quart" + if dt.minute == 35: + minute = -25 + hour = dt.hour + 1 + elif dt.minute == 40: + minute = -20 + hour = dt.hour + 1 + elif dt.minute == 45: + minute = -15 + hour = dt.hour + 1 + elif dt.minute == 50: + minute = -10 + hour = dt.hour + 1 + elif dt.minute == 55: + minute = -5 + hour = dt.hour + 1 + else: + minute = dt.minute + hour = dt.hour + + if hour == 0: + speak += "minuit" + elif hour == 12: + speak += "midi" + elif hour == 1 or hour == 13: + speak += "une heure" + elif hour < 13: + speak = pronounce_number_fr(hour) + " heures" + else: + speak = pronounce_number_fr(hour-12) + " heures" + + if minute != 0: + if minute == 15: + speak += " et quart" + elif minute == 30: + speak += " et demi" + elif minute == -15: + speak += " moins le quart" + else: + speak += " " + pronounce_number_fr(minute) + + if use_ampm: + if hour > 17: + speak += " du soir" + elif hour > 12: + speak += " de l'après-midi" + elif hour > 0 and hour < 12: + speak += " du matin" + + return speak diff --git a/lingua_franca/lang/format_hu.py b/lingua_franca/lang/format_hu.py new file mode 100644 index 0000000..f12a184 --- /dev/null +++ b/lingua_franca/lang/format_hu.py @@ -0,0 +1,307 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_hu import _NUM_POWERS_OF_TEN, \ + _EXTRA_SPACE_HU, _FRACTION_STRING_HU, _MONTHS_HU, _NUM_STRING_HU +from math import floor + + +def _get_vocal_type_hu(word): + # checks the vocal attributes of a word + vowels_high = len([char for char in word if char in 'eéiíöőüű']) + vowels_low = len([char for char in word if char in 'aáoóuú']) + if vowels_high != 0 and vowels_low != 0: + return 2 # 2: type is mixed + return 0 if vowels_high == 0 else 1 # 0: type is low, 1: is high + + +def nice_number_hu(number, speech=True, denominators=range(1, 21)): + """ Hungarian helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 és fél" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)).replace(".", ",") + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. 
"1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_HU[den] + if whole == 0: + if num == 1: + one = 'egy ' if den != 2 else '' + return_string = '{}{}'.format(one, den_str) + else: + return_string = '{} {}'.format(num, den_str) + elif num == 1: + pointOne = 'egész egy' if den != 2 else 'és' + return_string = '{} {} {}'.format(whole, pointOne, den_str) + else: + return_string = '{} egész {} {}'.format(whole, num, den_str) + return return_string + + +def pronounce_number_hu(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + number(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + # TODO short_scale, scientific and ordinals + # currently ignored + + def pronounce_triplet_hu(num): + result = "" + num = floor(num) + if num > 99: + hundreds = floor(num / 100) + if hundreds > 0: + hundredConst = _EXTRA_SPACE_HU + 'száz' + _EXTRA_SPACE_HU + if hundreds == 1: + result += hundredConst + elif hundreds == 2: + result += 'két' + hundredConst + else: + result += _NUM_STRING_HU[hundreds] + hundredConst + num -= hundreds * 100 + if num == 0: + result += '' # do nothing + elif num <= 20: + result += _NUM_STRING_HU[num] # + _EXTRA_SPACE_DA + elif num > 20: + ones = num % 10 + tens = num - ones + if tens > 0: + if tens != 20: + result += _NUM_STRING_HU[tens] + _EXTRA_SPACE_HU + else: + result += "huszon" + _EXTRA_SPACE_HU + if ones > 0: + result += _NUM_STRING_HU[ones] + _EXTRA_SPACE_HU + return result + + def 
pronounce_whole_number_hu(num, scale_level=0): + if num == 0: + return '' + + num = floor(num) + result = '' + last_triplet = num % 1000 + + if last_triplet == 1: + if scale_level == 0: + if result != '': + result += '' + "egy" + else: + result += "egy" + elif scale_level == 1: + result += _EXTRA_SPACE_HU + \ + _NUM_POWERS_OF_TEN[1] + _EXTRA_SPACE_HU + else: + result += "egy" + _NUM_POWERS_OF_TEN[scale_level] + elif last_triplet > 1: + result += pronounce_triplet_hu(last_triplet) + if scale_level != 0: + result = result.replace(_NUM_STRING_HU[2], 'két') + if scale_level == 1: + result += _NUM_POWERS_OF_TEN[1] + _EXTRA_SPACE_HU + if scale_level >= 2: + result += _NUM_POWERS_OF_TEN[scale_level] + if scale_level > 0: + result += '-' + + num = floor(num / 1000) + scale_level += 1 + return pronounce_whole_number_hu(num, + scale_level) + result + + result = "" + if abs(number) >= 1000000000000000000000000: # cannot do more than this + return str(number) + elif number == 0: + return str(_NUM_STRING_HU[0]) + elif number < 0: + return "mínusz " + pronounce_number_hu(abs(number), places) + else: + if number == int(number): + return pronounce_whole_number_hu(number).strip('-') + else: + whole_number_part = floor(number) + fractional_part = number - whole_number_part + if whole_number_part == 0: + result += _NUM_STRING_HU[0] + result += pronounce_whole_number_hu(whole_number_part) + if places > 0: + result += " egész " + fraction = pronounce_whole_number_hu( + round(fractional_part * 10 ** places)) + result += fraction.replace(_NUM_STRING_HU[2], 'két') + fraction_suffixes = [ + 'tized', 'század', 'ezred', 'tízezred', 'százezred'] + if places <= len(fraction_suffixes): + result += ' ' + fraction_suffixes[places - 1] + return result + + +def pronounce_ordinal_hu(number): + """ + This function pronounces a number as an ordinal + + 1 -> first + 2 -> second + + Args: + number (int): the number to format + Returns: + (str): The pronounced number string. 
+ """ + ordinals = ["nulladik", "első", "második", "harmadik", "negyedik", + "ötödik", "hatodik", "hetedik", "nyolcadik", "kilencedik", + "tizedik"] + big_ordinals = ["", "ezredik", "milliomodik"] + + # only for whole positive numbers including zero + if number < 0 or number != int(number): + return number + elif number < 11: + return ordinals[number] + else: + # concatenate parts and inflect them accordingly + root = pronounce_number_hu(number) + vtype = _get_vocal_type_hu(root) + last_digit = number - floor(number / 10) * 10 + if root == "húsz": + root = "husz" + if number % 1000000 == 0: + return root.replace(_NUM_POWERS_OF_TEN[2], big_ordinals[2]) + if number % 1000 == 0: + return root.replace(_NUM_POWERS_OF_TEN[1], big_ordinals[1]) + if last_digit == 1: + return root + "edik" + elif root[-1] == 'ő': + return root[:-1] + 'edik' + elif last_digit != 0: + return ordinals[last_digit].join( + root.rsplit(_NUM_STRING_HU[last_digit], 1)) + return root + "edik" if vtype == 1 else root + "adik" + + +def nice_time_hu(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'five thirty' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + speak += pronounce_number_hu(dt.hour) + speak = speak.replace(_NUM_STRING_HU[2], 'két') + speak += " óra" + if not dt.minute == 0: # zero minutes are not pronounced + speak += " " + pronounce_number_hu(dt.minute) + + return speak # ampm is ignored when use_24hour is true + else: + if dt.hour == 0 and dt.minute == 0: + return "éjfél" + if dt.hour == 12 and dt.minute == 0: + return "dél" + # TODO: "half past 3", "a quarter of 4" and other idiomatic times + + if dt.hour == 0: + speak += pronounce_number_hu(12) + elif dt.hour < 13: + speak = pronounce_number_hu(dt.hour) + else: + speak = pronounce_number_hu(dt.hour - 12) + + speak = speak.replace(_NUM_STRING_HU[2], 'két') + speak += " óra" + + if not dt.minute == 0: + speak += " " + pronounce_number_hu(dt.minute) + + if use_ampm: + if dt.hour > 11: + if dt.hour < 18: + speak = "délután " + speak # 12:01 - 17:59 + elif dt.hour < 22: + speak = "este " + speak # 18:00 - 21:59 este/evening + else: + speak = "éjjel " + speak # 22:00 - 23:59 éjjel/at night + elif dt.hour < 3: + speak = "éjjel " + speak # 00:01 - 02:59 éjjel/at night + else: + speak = "reggel " + speak # 03:00 - 11:59 reggel/in t. morning + + return speak diff --git a/lingua_franca/lang/format_it.py b/lingua_franca/lang/format_it.py new file mode 100644 index 0000000..52cf4e1 --- /dev/null +++ b/lingua_franca/lang/format_it.py @@ -0,0 +1,342 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_it import _NUM_STRING_IT, \ + _FRACTION_STRING_IT, _LONG_SCALE_IT, _SHORT_SCALE_IT + + +def nice_number_it(number, speech=True, denominators=range(1, 21)): + """ Italian helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 e un mezz" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. 
+ """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + # denominatore + den_str = _FRACTION_STRING_IT[den] + # frazione + if whole == 0: + if num == 1: + # un decimo + return_string = 'un {}'.format(den_str) + else: + # tre mezzi + return_string = '{} {}'.format(num, den_str) + # interi >10 + elif num == 1: + # trenta e un + return_string = '{} e un {}'.format(whole, den_str) + # interi >10 con frazioni + else: + # venti e 3 decimi + return_string = '{} e {} {}'.format(whole, num, den_str) + + # gestisce il plurale del denominatore + if num > 1: + return_string += 'i' + else: + return_string += 'o' + + return return_string + + +def pronounce_number_it(number, places=2, short_scale=False, scientific=False): + """ + Convert a number to it's spoken equivalent + adapted to italian fron en version + + For example, '5.2' would return 'cinque virgola due' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + Returns: + (str): The pronounced number + """ + num = number + # gestione infinito + if num == float("inf"): + return "infinito" + elif num == float("-inf"): + return "meno infinito" + + if scientific: + number = '%E' % num + n, power = number.replace("+", "").split("E") + power = int(power) + if power != 0: + return '{}{} per dieci elevato alla {}{}'.format( + 'meno ' if float(n) < 0 else '', + pronounce_number_it(abs(float(n)), places, short_scale, False), + 'meno ' if power < 0 else '', + pronounce_number_it(abs(power), places, short_scale, False)) + + if 
short_scale: + number_names = _NUM_STRING_IT.copy() + number_names.update(_SHORT_SCALE_IT) + else: + number_names = _NUM_STRING_IT.copy() + number_names.update(_LONG_SCALE_IT) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale: + hundreds = [_SHORT_SCALE_IT[n] for n in _SHORT_SCALE_IT.keys()] + else: + hundreds = [_LONG_SCALE_IT[n] for n in _LONG_SCALE_IT.keys()] + + # deal with negatives + result = "" + if num < 0: + result = "meno " + num = abs(num) + + # check for a direct match + if num in number_names: + if num > 90: + result += "" # inizio stringa + result += number_names[num] + else: + def _sub_thousand(n): + assert 0 <= n <= 999 + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + _deci = tens[q-1] + _unit = r + _partial = _deci + if _unit > 0: + if _unit == 1 or _unit == 8: + _partial = _partial[:-1] # ventuno ventotto + _partial += number_names[_unit] + return _partial + else: + q, r = divmod(n, 100) + if q == 1: + _partial = "cento" + else: + _partial = digits[q] + "cento" + _partial += ( + " " + _sub_thousand(r) if r else "") # separa centinaia + return _partial + + def _short_scale(n): + if n >= max(_SHORT_SCALE_IT.keys()): + return "numero davvero enorme" + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z) + if i: + number += "" # separa ordini grandezza + number += hundreds[i] + res.append(number) + + return ", ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_IT.keys()): + return "numero davvero enorme" + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000000)): + if not z: + continue + number = pronounce_number_it(z, places, True, scientific) + # strip off the comma after the thousand + if i: + 
# plus one as we skip 'thousand' + # (and 'hundred', but this is excluded by index value) + number = number.replace(',', '') + number += " " + hundreds[i+1] + res.append(number) + return ", ".join(reversed(res)) + + if short_scale: + result += _short_scale(num) + else: + result += _long_scale(num) + + # normalizza unità misura singole e 'ragionevoli' ed ad inizio stringa + if result == 'mila': + result = 'mille' + if result == 'milioni': + result = 'un milione' + if result == 'miliardi': + result = 'un miliardo' + if result[0:7] == 'unomila': + result = result.replace('unomila', 'mille', 1) + if result[0:10] == 'unomilioni': + result = result.replace('unomilioni', 'un milione', 1) + # if result[0:11] == 'unomiliardi': + # result = result.replace('unomiliardi', 'un miliardo', 1) + + # Deal with fractional part + if not num == int(num) and places > 0: + if abs(num) < 1.0 and (result == "meno " or not result): + result += "zero" + result += " virgola" + _num_str = str(num) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + number_names[int(char)] + return result + + +def nice_time_it(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + adapted to italian fron en version + + For example, generate 'cinque e trenta' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + # Either "zero 8 zerozero" o "13 zerozero" + if string[0:2] == '00': + speak += "zerozero" + elif string[0] == '0': + speak += pronounce_number_it(int(string[0])) + " " + if int(string[1]) == 1: + speak = "una" + else: + speak += pronounce_number_it(int(string[1])) + else: + speak = pronounce_number_it(int(string[0:2])) + + # in italian "13 e 25" + speak += " e " + + if string[3:5] == '00': + speak += "zerozero" + else: + if string[3] == '0': + speak += pronounce_number_it(0) + " " + speak += pronounce_number_it(int(string[4])) + else: + speak += pronounce_number_it(int(string[3:5])) + return speak + else: + if dt.hour == 0 and dt.minute == 0: + return "mezzanotte" + if dt.hour == 12 and dt.minute == 0: + return "mezzogiorno" + # TODO: "10 e un quarto", "4 e tre quarti" and ot her idiomatic times + + if dt.hour == 0: + speak = "mezzanotte" + elif dt.hour == 1 or dt.hour == 13: + speak = "una" + elif dt.hour > 13: # era minore + speak = pronounce_number_it(dt.hour-12) + else: + speak = pronounce_number_it(dt.hour) + + speak += " e" + if dt.minute == 0: + speak = speak[:-2] + if not use_ampm: + speak += " in punto" + elif dt.minute == 15: + speak += " un quarto" + elif dt.minute == 45: + speak += " tre quarti" + else: + if dt.minute < 10: + speak += " zero" + speak += " " + pronounce_number_it(dt.minute) + + if use_ampm: + + if dt.hour < 4: + speak.strip() + elif dt.hour > 20: + speak += " della notte" + elif dt.hour > 17: + speak += " della sera" + elif dt.hour > 12: + speak += " del pomeriggio" + else: + speak += " della mattina" + + return speak diff --git a/lingua_franca/lang/format_nl.py b/lingua_franca/lang/format_nl.py new file mode 100644 index 0000000..cba2110 --- /dev/null +++ b/lingua_franca/lang/format_nl.py @@ -0,0 +1,337 @@ +# 
+# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_nl import _NUM_POWERS_OF_TEN, \ + _NUM_STRING_NL, _FRACTION_STRING_NL, _EXTRA_SPACE_NL, _MONTHS_NL +from math import floor + + +def nice_number_nl(number, speech=True, denominators=range(1, 21)): + """ Dutch helper for nice_number + This function formats a float to human understandable functions. Like + 4.5 becomes "4 einhalb" for speech and "4 1/2" for text + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)).replace(".", ",") + whole, num, den = result + if not speech: + if num == 0: + # TODO: Number grouping? E.g. 
"1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_NL[den] + if whole == 0: + if num == 1: + return_string = 'één {}'.format(den_str) + else: + return_string = '{} {}'.format(num, den_str) + elif num == 1: + return_string = '{} en één {}'.format(whole, den_str) + else: + return_string = '{} en {} {}'.format(whole, num, den_str) + + return return_string + + +def pronounce_number_nl(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + number(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + # TODO short_scale, scientific and ordinals + # currently ignored + + def pronounce_triplet_nl(num): + result = "" + num = floor(num) + if num > 99: + hundreds = floor(num / 100) + if hundreds > 0: + result += _NUM_STRING_NL[ + hundreds] + _EXTRA_SPACE_NL + 'honderd' + _EXTRA_SPACE_NL + num -= hundreds * 100 + if num == 0: + result += '' # do nothing + elif num <= 20: + result += _NUM_STRING_NL[num] # + _EXTRA_SPACE_DA + elif num > 20: + ones = num % 10 + tens = num - ones + if ones > 0: + result += _NUM_STRING_NL[ones] + _EXTRA_SPACE_NL + if tens > 0: + result += 'en' + _EXTRA_SPACE_NL + if tens > 0: + result += _NUM_STRING_NL[tens] + _EXTRA_SPACE_NL + return result + + def pronounce_fractional_nl(num, + places): # fixed number of places even with + # trailing zeros + result = "" + place = 10 + while places > 0: # doesn't work with 1.0001 and places = 2: int( + # number*place) % 10 > 0 and places > 0: + result += " " + 
_NUM_STRING_NL[int(num * place) % 10] + if int(num * place) % 10 == 1: + result += '' # "1" is pronounced "eins" after the decimal + # point + place *= 10 + places -= 1 + return result + + def pronounce_whole_number_nl(num, scale_level=0): + if num == 0: + return '' + + num = floor(num) + result = '' + last_triplet = num % 1000 + + if last_triplet == 1: + if scale_level == 0: + if result != '': + result += '' + 'één' + else: + result += "één" + elif scale_level == 1: + result += 'één' + _EXTRA_SPACE_NL + 'duizend' + _EXTRA_SPACE_NL + else: + result += "één " + _NUM_POWERS_OF_TEN[scale_level] + ' ' + elif last_triplet > 1: + result += pronounce_triplet_nl(last_triplet) + if scale_level == 1: + # result += _EXTRA_SPACE_DA + result += 'duizend' + _EXTRA_SPACE_NL + if scale_level >= 2: + # if _EXTRA_SPACE_DA == '': + # result += " " + result += " " + _NUM_POWERS_OF_TEN[scale_level] + ' ' + if scale_level >= 2: + if scale_level % 2 == 0: + result += "" # Miljioen + result += "" # Miljard, Miljoen + + num = floor(num / 1000) + scale_level += 1 + return pronounce_whole_number_nl(num, + scale_level) + result + '' + + result = "" + if abs(number) >= 1000000000000000000000000: # cannot do more than this + return str(number) + elif number == 0: + return str(_NUM_STRING_NL[0]) + elif number < 0: + return "min " + pronounce_number_nl(abs(number), places) + else: + if number == int(number): + return pronounce_whole_number_nl(number) + else: + whole_number_part = floor(number) + fractional_part = number - whole_number_part + result += pronounce_whole_number_nl(whole_number_part) + if places > 0: + result += " komma" + result += pronounce_fractional_nl(fractional_part, places) + return result + + +def pronounce_ordinal_nl(number): + """ + This function pronounces a number as an ordinal + + 1 -> first + 2 -> second + + Args: + number (int): the number to format + Returns: + (str): The pronounced number string. 
+ """ + ordinals = ["nulste", "eerste", "tweede", "derde", "vierde", "vijfde", + "zesde", "zevende", "achtste"] + # only for whole positive numbers including zero + if number < 0 or number != int(number): + return number + if number < 4: + return ordinals[number] + if number < 8: + return pronounce_number_nl(number) + "de" + if number < 9: + return pronounce_number_nl(number) + "ste" + if number < 20: + return pronounce_number_nl(number) + "de" + return pronounce_number_nl(number) + "ste" + + +def nice_time_nl(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'five thirty' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + speak += pronounce_number_nl(dt.hour) + speak += " uur" + if not dt.minute == 0: # zero minutes are not pronounced, 13:00 is + # "13 uur" not "13 hundred hours" + speak += " " + pronounce_number_nl(dt.minute) + return speak # ampm is ignored when use_24hour is true + else: + if dt.hour == 0 and dt.minute == 0: + return "Middernacht" + hour = dt.hour % 12 + if dt.minute == 0: + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + speak += " uur" + elif dt.minute == 30: + speak += "half " + hour += 1 + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + elif dt.minute == 15: + speak += "kwart over " + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + elif dt.minute == 45: + speak += "kwart voor " + hour += 1 + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + elif dt.minute > 30: + speak += pronounce_number_nl(60 - dt.minute) + speak += " voor " + hour += 1 + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + else: + speak += pronounce_number_nl(dt.minute) + speak += " over " + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + + if use_ampm: + speak += nice_part_of_day_nl(dt) + + return speak + + +def _fix_hour_nl(hour): + hour = hour % 12 + if hour == 0: + hour = 12 + return hour + + +def nice_part_of_day_nl(dt, speech=True): + if dt.hour < 6: + return " 's nachts" + if dt.hour < 12: + return " 's ochtends" + if dt.hour < 18: + return " 's middags" + if dt.hour < 24: + return " 's avonds" + raise ValueError('dt.hour is bigger than 24') + + +def nice_response_nl(text): + # check for months and call _nice_ordinal_nl declension of ordinals + # replace "^" with "tot de macht" (to the power of) + words = text.split() + + for idx, word in enumerate(words): + if 
word.lower() in _MONTHS_NL: + text = _nice_ordinal_nl(text) + + if word == '^': + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + if wordNext.isnumeric(): + words[idx] = "tot de macht" + text = " ".join(words) + return text + + +def _nice_ordinal_nl(text, speech=True): + # check for months for declension of ordinals before months + # depending on articles/prepositions + normalized_text = text + words = text.split() + for idx, word in enumerate(words): + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordPrev = words[idx - 1] if idx > 0 else "" + if word[:-1].isdecimal(): + if wordNext.lower() in _MONTHS_NL: + if wordPrev == 'de': + word = pronounce_ordinal_nl(int(word)) + else: + word = pronounce_number_nl(int(word)) + words[idx] = word + normalized_text = " ".join(words) + return normalized_text diff --git a/lingua_franca/lang/format_pl.py b/lingua_franca/lang/format_pl.py new file mode 100644 index 0000000..483a0fc --- /dev/null +++ b/lingua_franca/lang/format_pl.py @@ -0,0 +1,351 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def nice_number_pl(number, speech=True, denominators=range(1, 21)):
    """ Polish helper for nice_number

    Formats a number as a mixed fraction. For display the result looks
    like "4 1/2"; for speech the denominator is replaced by its entry in
    _FRACTION_STRING_PL and the whole part is joined with "i".

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    mixed = convert_to_mixed_fraction(number, denominators)
    if not mixed:
        # No usable fraction: fall back to a number with 3 decimals
        return str(round(number, 3))

    whole, num, den = mixed

    # Without a fractional part both output modes are just the integer
    if num == 0:
        return str(whole)
    if not speech:
        return '{} {}/{}'.format(whole, num, den)

    den_str = _FRACTION_STRING_PL[den]
    spoken = ('{} {}'.format(num, den_str) if whole == 0
              else '{} i {} {}'.format(whole, num, den_str))

    # More than one part: swap the final character for the plural ending
    return spoken[:-1] + 'e' if num > 1 else spoken
short_scale, False), + 'minus ' if power < 0 else '', + pronounce_number_pl(abs(power), places, short_scale, False)) + + number_names = _NUM_STRING_PL.copy() + number_names.update(_SHORT_SCALE_PL) + + digits = [number_names[n] for n in range(0, 20)] + if ordinals: + tens = [_SHORT_ORDINAL_PL[n] for n in range(10, 100, 10)] + else: + tens = [number_names[n] for n in range(10, 100, 10)] + hundreds = [_SHORT_SCALE_PL[n] for n in _SHORT_SCALE_PL.keys()] + + # deal with negatives + result = "" + if num < 0: + result = "minus " + num = abs(num) + + # check for a direct match + if num in number_names and not ordinals: + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False, iteration=0): + assert 0 <= n <= 999 + + _, n_mod = divmod(n, 10) + if iteration > 0 and n in _ALT_ORDINALS_PL and ordinals: + return _ALT_ORDINALS_PL[n] + elif n in _SHORT_ORDINAL_PL and ordinals: + return _SHORT_ORDINAL_PL[n] if not scientific_run \ + else _ALT_ORDINALS_PL[n] + if n <= 19: + return digits[n] if not scientific_run or not ordinals\ + else digits[n][:-1] + "ej" + elif n <= 99: + q, r = divmod(n, 10) + tens_text = tens[q - 1] + if scientific_run: + tens_text = tens_text[:-1] + "ej" + return tens_text + (" " + _sub_thousand(r, ordinals) if r + else "") + else: + q, r = divmod(n, 100) + digit_name = digits[q] + if q*100 in _NUM_STRING_PL: + digit_name = _NUM_STRING_PL[q*100] + + return digit_name + ( + " " + _sub_thousand(r, ordinals) if r else "") + + def _short_scale(n): + if n >= max(_SHORT_SCALE_PL.keys()): + return "nieskończoność" + ordi = ordinals + + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z, ordi, iteration=i) + + if i: + if i >= len(hundreds): + return "" + number += " " + if ordi: + if i * 1000 in _SHORT_ORDINAL_PL: + if z == 1: + number = _SHORT_ORDINAL_PL[i * 1000] + else: + number += _SHORT_ORDINAL_PL[i * 1000] + else: + if n not in 
_SHORT_SCALE_PL: + num = int("1" + "0" * (len(str(n)) - 2)) + + number += _SHORT_SCALE_PL[num] + "owa" + else: + number = _SHORT_SCALE_PL[n] + "ty" + else: + hundreds_text = _SHORT_SCALE_PL[float(pow(1000, i))] + if z != 1: + _, z_mod = divmod(z, 10) + _, z_mod_tens = divmod(z, 100) + n_main, _ = divmod(z_mod_tens, 10) + if i == 1: + if n_main != 1 and 5 > z_mod > 0: + hundreds_text += "e" + else: + hundreds_text = "tysięcy" + elif i > 1: + hundreds_text += "y" if 5 > z_mod > 0 else "ów" + + number += hundreds_text + res.append(number) + ordi = False + + return ", ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + result += _short_scale(num) + + # deal with scientific notation unpronounceable as number + if not result and "e" in str(num): + return pronounce_number_pl(num, places, short_scale, scientific=True) + # Deal with fractional part + elif not num == int(num) and places > 0: + if abs(num) < 1.0 and (result == "minus " or not result): + result += "zero" + result += " przecinek" + _num_str = str(num) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + number_names[int(char)] + return result + + +def nice_time_pl(dt, speech=True, use_24hour=True, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'five thirty' for speech or '5:30' for + text display. 
def _plural_pl(count, one, few, many):
    """Return the Polish noun form that agrees with the integer ``count``."""
    if count == 1:
        return one
    # 2-4 in the last digit take the nominative plural, except 12-14;
    # everything else (0, 5-21, 25-31, ...) takes the genitive plural.
    if 2 <= count % 10 <= 4 and not 12 <= count % 100 <= 14:
        return few
    return many


def nice_duration_pl(duration, speech=True):
    """ Convert duration to a nice spoken timespan

    Args:
        duration (int or float): length of the timespan in seconds
        speech (bool): must be True; there is no display format
    Returns:
        str: timespan as a string, built from the non-zero day, hour,
            minute and second parts separated by single spaces (empty
            when every part is zero)
    Raises:
        FunctionNotLocalizedError: if speech is False
    """

    # TODO this is a kludge around the fact that only Polish has a
    # localized nice_duration()
    if not speech:
        raise FunctionNotLocalizedError

    days = int(duration // 86400)
    hours = int(duration // 3600 % 24)
    minutes = int(duration // 60 % 60)
    seconds = int(duration % 60)

    parts = []
    if days > 0:
        parts.append(pronounce_number_pl(days) + " "
                     + ('dzień' if days == 1 else 'dni'))
    if hours > 0:
        parts.append(get_pronounce_number_for_duration(hours) + " "
                     + _plural_pl(hours, 'godzina', 'godziny', 'godzin'))
    if minutes > 0:
        parts.append(get_pronounce_number_for_duration(minutes) + " "
                     + _plural_pl(minutes, 'minuta', 'minuty', 'minut'))
    if seconds > 0:
        parts.append(get_pronounce_number_for_duration(seconds) + " "
                     + _plural_pl(seconds, 'sekunda', 'sekundy', 'sekund'))

    return " ".join(parts)
Like + 4.5 becomes "4 e meio" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. "1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + # denominador + den_str = _FRACTION_STRING_PT[den] + # fracções + if whole == 0: + if num == 1: + # um décimo + return_string = 'um {}'.format(den_str) + else: + # três meio + return_string = '{} {}'.format(num, den_str) + # inteiros >10 + elif num == 1: + # trinta e um + return_string = '{} e {}'.format(whole, den_str) + # inteiros >10 com fracções + else: + # vinte e 3 décimo + return_string = '{} e {} {}'.format(whole, num, den_str) + # plural + if num > 1: + return_string += 's' + return return_string + + +def pronounce_number_pt(number, places=2): + """ + Convert a number to it's spoken equivalent + For example, '5.2' would return 'cinco virgula dois' + Args: + number(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + Returns: + (str): The pronounced number + """ + if abs(number) >= 100: + # TODO: Support n > 100 + return str(number) + + result = "" + if number < 0: + result = "menos " + number = abs(number) + + if number >= 20: + tens = int(number - int(number) % 10) + ones = int(number - tens) + result += _NUM_STRING_PT[tens] + if ones > 0: + result += " e " + _NUM_STRING_PT[ones] + else: + result += _NUM_STRING_PT[int(number)] + + # Deal with decimal part, in portuguese is commonly used the comma + # instead the dot. 
def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format
    For example, generate 'uma e meia' for speech or '1:30' for
    text display.
    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    speak = ""
    if use_24hour:
        # simply speak the number
        if dt.hour == 1:
            speak += "uma"
        else:
            speak += pronounce_number_pt(dt.hour)

        # equivalent to "quarter past ten"
        if dt.minute > 0:
            speak += " e " + pronounce_number_pt(dt.minute)

    else:
        # These minutes are spoken as "<next hour> minus <remaining>"
        if dt.minute in (35, 40, 45, 50, 55):
            minute = dt.minute - 60
            # wrap around midnight so 23:45 refers to hour 0, not hour 24
            hour = (dt.hour + 1) % 24
        else:
            minute = dt.minute
            hour = dt.hour

        if hour == 0:
            speak += "meia noite"
        elif hour == 12:
            speak += "meio dia"
        # 1 and 2 are pronounced in female form when talking about hours
        elif hour == 1 or hour == 13:
            speak += "uma"
        elif hour == 2 or hour == 14:
            speak += "duas"
        elif hour < 13:
            speak = pronounce_number_pt(hour)
        else:
            speak = pronounce_number_pt(hour - 12)

        if minute != 0:
            if minute == 15:
                speak += " e um quarto"
            elif minute == 30:
                speak += " e meia"
            elif minute == -15:
                speak += " menos um quarto"
            elif minute > 0:
                speak += " e " + pronounce_number_pt(minute)
            else:
                # negative minutes: the pronounced number carries "menos"
                speak += " " + pronounce_number_pt(minute)

        # exact time
        if minute == 0 and not use_ampm:
            # 3:00
            speak += " em ponto"

        # speak number and add daytime identifier
        # (equivalent to "in the morning")
        if use_ampm:
            if hour > 0 and hour < 6:
                speak += " da madrugada"
            elif hour >= 6 and hour < 12:
                speak += " da manhã"
            elif hour >= 13 and hour < 21:
                speak += " da tarde"
            elif hour != 0 and hour != 12:
                speak += " da noite"
    return speak
def nice_number_ru(number, speech=True, denominators=range(1, 21)):
    """ Russian helper for nice_number

    Formats a number as a mixed fraction. For display the result looks
    like "4 1/2"; for speech the denominator is replaced by its entry in
    _FRACTION_STRING_RU and its ending is adjusted to the numerator.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    mixed = convert_to_mixed_fraction(number, denominators)
    if not mixed:
        # No usable fraction: fall back to a number with 3 decimals
        return str(round(number, 3))

    whole, num, den = mixed

    # Without a fractional part both output modes are just the integer
    if num == 0:
        return str(whole)
    if not speech:
        return '{} {}/{}'.format(whole, num, den)

    den_str = _FRACTION_STRING_RU[den]
    if whole != 0:
        if num == 1 and den == 2:
            spoken = '{} с половиной'.format(whole)
        else:
            spoken = '{} и {} {}'.format(whole, num, den_str)
    elif num == 1 and den <= 4:
        # a single half/third/quarter is spoken without the numerator
        spoken = '{}'.format(den_str)
    else:
        spoken = '{} {}'.format(num, den_str)

    # Numerators of two or more change the ending of the denominator word
    if num >= 2:
        few = num <= 4
        if 2 <= den <= 4:
            spoken = spoken[:-1] + ('и' if few else 'ей')
        elif den >= 5:
            spoken = spoken[:-2] + ('ые' if few else 'ых')

    return spoken
pronounce_number_ru(abs(power), places, short_scale, False, ordinals=True)) + else: + # This handles negative powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} на десять в степени {}{}'.format( + 'минус ' if float(n) < 0 else '', + pronounce_number_ru( + abs(float(n)), places, short_scale, False, ordinals=False), + 'минус ' if power < 0 else '', + pronounce_number_ru(abs(power), places, short_scale, False, ordinals=False)) + + if short_scale: + number_names = _NUM_STRING_RU.copy() + number_names.update(_SHORT_SCALE_RU) + else: + number_names = _NUM_STRING_RU.copy() + number_names.update(_LONG_SCALE_RU) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale: + hundreds = [_SHORT_SCALE_RU[n] for n in _SHORT_SCALE_RU.keys()] + else: + hundreds = [_LONG_SCALE_RU[n] for n in _LONG_SCALE_RU.keys()] + + # deal with negative numbers + result = "" + if num < 0: + result = "минус " + num = abs(num) + + # check for a direct match + if num in number_names and not ordinals: + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False): + assert 0 <= n <= 999 + if n in _SHORT_ORDINAL_RU and ordinals: + return _SHORT_ORDINAL_RU[n] + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r + else "") + else: + q, r = divmod(n, 100) + return _NUM_STRING_RU[q * 100] + (" " + _sub_thousand(r, ordinals) if r else "") + + def _short_scale(n): + if n > max(_SHORT_SCALE_RU.keys()): + return "бесконечность" + ordi = ordinals + + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z, not i and ordi) + + if i: + if i >= len(hundreds): + return "" + if ordi: + if i * 1000 in _SHORT_ORDINAL_RU: + if z == 1: + number = _SHORT_ORDINAL_RU[i * 1000] + else: + if z > 5: 
+ number = number[:-1] + "и" + number += _SHORT_ORDINAL_RU[i * 1000] + else: + if n not in _SHORT_SCALE_RU: + num = int("1" + "0" * (len(str(n)) // 3 * 3)) + + if number[-3:] == "два": + number = number[:-1] + "ух" + elif number[-2:] == "ри" or number[-2:] == "ре": + number = number[:-1] + "ёх" + elif number[-1:] == "ь": + number = number[:-1] + "и" + + number += _SHORT_SCALE_RU[num] + "ный" + else: + number = _SHORT_SCALE_RU[n] + "ный" + elif z == 1: + number = hundreds[i - 1] + else: + if i == 1: + if z % 10 == 1 and z % 100 // 10 != 1: + number = number[:-2] + "на" + elif z % 10 == 2 and z % 100 // 10 != 1: + number = number[:-1] + "е" + number += " " + plural_ru(z, "тысяча", "тысячи", "тысяч") + elif 1 <= z % 10 <= 4 and z % 100 // 10 != 1: + number += " " + hundreds[i - 1] + "а" + else: + number += " " + hundreds[i - 1] + "ов" + + res.append(number) + ordi = False + + return " ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_RU.keys()): + return "бесконечность" + ordi = ordinals + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000000)): + if not z: + continue + number = pronounce_number_ru(z, places, True, scientific, + ordinals=ordi and not i) + # strip off the comma after the thousand + if i: + if i >= len(hundreds): + return "" + # plus one as we skip 'thousand' + # (and 'hundred', but this is excluded by index value) + number = number.replace(',', '') + + if ordi: + if (i + 1) * 1000000 in _LONG_ORDINAL_RU: + if z == 1: + number = _LONG_ORDINAL_RU[ + (i + 1) * 1000000] + else: + number += _LONG_ORDINAL_RU[ + (i + 1) * 1000000] + else: + if n not in _LONG_SCALE_RU: + num = int("1" + "0" * (len(str(n)) // 3 * 3)) + + if number[-3:] == "два": + number = number[:-1] + "ух" + elif number[-2:] == "ри" or number[-2:] == "ре": + number = number[:-1] + "ёх" 
def nice_time_ru(dt, speech=True, use_24hour=True, use_ampm=False):
    """
    Format a time to a comfortable human format

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): append the part of day in 12-hour format
    Returns:
        (str): The formatted time string
    """
    def day_period(hour):
        # part-of-day suffix used instead of AM/PM
        if hour < 4:
            return " ночи"
        if hour < 12:
            return " утра"
        if hour < 18:
            return " дня"
        return " вечера"

    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        # e.g. "3:01" or "2:22", optionally followed by the part of day
        string = dt.strftime("%I:%M")
        if use_ampm:
            string += day_period(dt.hour)
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        hours_part, minutes_part = string[0:2], string[3:5]

        # a leading zero of the hour is pronounced on its own
        if hours_part[0] == '0':
            spoken_hour = (pronounce_hour_ru(int(hours_part[0])) + " "
                           + pronounce_number_ru(int(hours_part[1])))
        else:
            spoken_hour = pronounce_hour_ru(int(hours_part))

        if minutes_part == '00':
            spoken_minutes = "ровно"
        elif minutes_part[0] == '0':
            spoken_minutes = (pronounce_number_ru(0) + " "
                              + pronounce_number_ru(int(minutes_part[1])))
        else:
            spoken_minutes = pronounce_number_ru(int(minutes_part))

        return spoken_hour + " " + spoken_minutes

    if dt.minute == 0:
        if dt.hour == 0:
            return "полночь"
        if dt.hour == 12:
            return "полдень"

    hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    if dt.minute == 15:
        speak = pronounce_hour_ru(hour) + " с четвертью"
    elif dt.minute == 30:
        speak = pronounce_hour_ru(hour) + " с половиной"
    elif dt.minute == 45:
        next_hour = (dt.hour + 1) % 12 or 12
        speak = "без четверти " + pronounce_hour_ru(next_hour)
    else:
        speak = pronounce_hour_ru(hour)
        if dt.minute != 0:
            if dt.minute < 10:
                speak += " ноль"
            speak += " " + pronounce_number_ru(dt.minute)
        elif not use_ampm:
            # full hour; noon and midnight already returned above, so
            # hour equals dt.hour % 12 here
            if hour == 1:
                return speak
            return speak + " " + plural_ru(hour, "час", "часа", "часов")

    if use_ampm:
        speak += day_period(dt.hour)

    return speak
// 3600 % 24) + minutes = int(duration // 60 % 60) + seconds = int(duration % 60) + + out = '' + + if days > 0: + out += pronounce_number_ru(days) + out += " " + plural_ru(days, "день", "дня", "дней") + if hours > 0: + if out: + out += " " + out += pronounce_number_ru(hours) + out += " " + plural_ru(hours, "час", "часа", "часов") + if minutes > 0: + if out: + out += " " + out += pronounce_number_feminine_ru(minutes) + out += " " + plural_ru(minutes, "минута", "минуты", "минут") + if seconds > 0: + if out: + out += " " + out += pronounce_number_feminine_ru(seconds) + out += " " + plural_ru(seconds, "секунда", "секунды", "секунд") + + return out + + +def pronounce_hour_ru(num): + if num == 1: + return "час" + return pronounce_number_ru(num) + + +def pronounce_number_feminine_ru(num): + pronounced = pronounce_number_ru(num) + + num %= 100 + if num % 10 == 1 and num // 10 != 1: + return pronounced[:-2] + "на" + elif num % 10 == 2 and num // 10 != 1: + return pronounced[:-1] + "е" + + return pronounced + + +def plural_ru(num: int, one: str, few: str, many: str): + num %= 100 + if num // 10 == 1: + return many + if num % 10 == 1: + return one + if 2 <= num % 10 <= 4: + return few + return many diff --git a/lingua_franca/lang/format_sl.py b/lingua_franca/lang/format_sl.py new file mode 100644 index 0000000..6a48802 --- /dev/null +++ b/lingua_franca/lang/format_sl.py @@ -0,0 +1,419 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from lingua_franca.lang.common_data_sl import _NUM_STRING_SL, \ + _FRACTION_STRING_SL, _LONG_SCALE_SL, _SHORT_SCALE_SL, _SHORT_ORDINAL_SL +from lingua_franca.lang.format_common import convert_to_mixed_fraction + + +def nice_number_sl(number, speech=True, denominators=range(1, 21)): + """ Slovenian helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "2 in polovica" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_SL[den] + if whole == 0: + return_string = '{} {}'.format(num, den_str) + else: + return_string = '{} in {} {}'.format(whole, num, den_str) + + if num % 100 == 1: + pass + elif num % 100 == 2: + return_string = return_string[:-1] + 'i' + elif num % 100 == 3 or num % 100 == 4: + return_string = return_string[:-1] + 'e' + else: + return_string = return_string[:-1] + + return return_string + + +def pronounce_number_sl(num, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'pet celih dve' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific 
(bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + # deal with infinity + if num == float("inf"): + return "neskončno" + elif num == float("-inf"): + return "minus neskončno" + if scientific: + number = '%E' % num + n, power = number.replace("+", "").split("E") + power = int(power) + if power != 0: + if ordinals: + # This handles negatives of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} krat deset na {}{}'.format( + 'minus ' if float(n) < 0 else '', + pronounce_number_sl( + abs(float(n)), places, short_scale, False, ordinals=False), + 'minus ' if power < 0 else '', + pronounce_number_sl(abs(power), places, short_scale, False, ordinals=True)) + else: + # This handles negatives of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} krat deset na {}{}'.format( + 'minus ' if float(n) < 0 else '', + pronounce_number_sl( + abs(float(n)), places, short_scale, False), + 'minus ' if power < 0 else '', + pronounce_number_sl(abs(power), places, short_scale, False)) + + if short_scale: + number_names = _NUM_STRING_SL.copy() + number_names.update(_SHORT_SCALE_SL) + else: + number_names = _NUM_STRING_SL.copy() + number_names.update(_LONG_SCALE_SL) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale: + hundreds = [_SHORT_SCALE_SL[n] for n in _SHORT_SCALE_SL.keys()] + else: + hundreds = [_LONG_SCALE_SL[n] for n in _LONG_SCALE_SL.keys()] + + # deal with negatives + result = "" + if num < 0: + result = "minus " + num = abs(num) + + # check for a direct match + if num in number_names and not ordinals: + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False, is_male=False): + assert 0 <= n <= 999 + if n in _SHORT_ORDINAL_SL and ordinals: + return _SHORT_ORDINAL_SL[n] 
+ if n <= 19: + if is_male and n == 2: + return digits[n][:-1] + "a" + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + sub = _sub_thousand(r, False) + if r == 2: + sub = sub[:-1] + "a" + return ((sub + "in") if r else "") + ( + tens[q - 1]) + ("i" if ordinals else "") + else: + q, r = divmod(n, 100) + if q == 1: + qstr = "" + else: + qstr = digits[q] + return (qstr + "sto" + ( + " " + _sub_thousand(r, ordinals) if r else "")) + + def _plural_hundreds(n, hundred, ordi=True): + if hundred[-3:] != "jon": + if ordi: + return hundred + "i" + + return hundred + + if n < 1000 or short_scale: + if ordi: + return hundred + "ti" + + if n % 100 == 1: + return hundred + elif n % 100 == 2: + return hundred + "a" + elif n % 100 == 3 or n % 100 == 4: + return hundred + "i" + else: + return hundred + "ov" + else: + n //= 1000 + + if ordi: + return hundred[:-3] + "jardti" + + if n % 100 == 1: + return hundred[:-3] + "jarda" + elif n % 100 == 2: + return hundred[:-3] + "jardi" + elif n % 100 == 3 or n % 100 == 4: + return hundred[:-3] + "jarde" + else: + return hundred[:-3] + "jard" + + def _short_scale(n): + if n >= max(_SHORT_SCALE_SL.keys()): + return "neskončno" + ordi = ordinals + + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + + split = _split_by(n, 1000) + if ordinals and len([a for a in split if a > 0]) == 1: + ordi_force = True + else: + ordi_force = False + + for i, z in enumerate(split): + if not z: + continue + + if z == 1 and i == 1: + number = "" + elif z > 100 and z % 100 == 2: + number = _sub_thousand(z, not i and ordi, is_male=True) + elif z > 100 and z % 100 == 3: + number = _sub_thousand(z, not i and ordi) + "je" + elif z > 1 or i == 0 or ordi: + number = _sub_thousand(z, not i and ordi) + else: + number = "" + + if i: + if i >= len(hundreds): + return "" + if z > 1: + number += " " + number += _plural_hundreds( + z, hundreds[i], True if ordi_force else not i and ordi) + res.append(number) + ordi = False + + return " 
".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_SL.keys()): + return "neskončno" + ordi = ordinals + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + + split = _split_by(n, 1000000) + if ordinals and len([a for a in split if a > 0]) == 1: + ordi_force = True + else: + ordi_force = False + + for i, z in enumerate(split): + if not z: + continue + + number = pronounce_number_sl(z, places, True, scientific) + if z > 100: + add = number.split()[0] + " " + else: + add = "" + if z % 100 == 2 and i >= 1: + number = add + digits[2][:-1] + "a" + if z % 100 == 3 and i >= 1: + number = add + digits[3] + "je" + + # strip off the comma after the thousand + if i: + if i >= len(hundreds): + return "" + # plus one as we skip 'thousand' + # (and 'hundred', but this is excluded by index value) + hundred = _plural_hundreds( + z, hundreds[i + 1], True if ordi_force else ordi and not i) + + if z >= 1000: + z //= 1000 + number = pronounce_number_sl(z, places, True, scientific, + ordinals=True if ordi_force else ordi and not i) + + if z == 1: + number = hundred + else: + number += " " + hundred + res.append(number) + return " ".join(reversed(res)) + + if short_scale: + result += _short_scale(num) + else: + result += _long_scale(num) + + if ordinals: + result = result.replace(" ", "") + + # deal with scientific notation unpronounceable as number + if (not result or result == "neskončno") and "e" in str(num): + return pronounce_number_sl(num, places, short_scale, scientific=True) + # Deal with fractional part + elif not num == int(num) and places > 0: + if abs(num) < 1.0 and (result == "minus " or not result): + result += "nič" + + if int(abs(num)) % 100 == 1: + result += " cela" + elif int(abs(num)) % 100 == 2: + result += " celi" + elif int(abs(num)) % 100 == 3 or int(abs(num)) % 100 == 4: + result += " cele" + else: 
def nice_time_sl(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'pet trideset' for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        display = dt.strftime("%H:%M")
    else:
        # e.g. "3:01 AM" / "2:22 PM", or "3:01" / "2:22" without am/pm
        display = dt.strftime("%I:%M %p" if use_ampm else "%I:%M")
        if display[0] == '0':
            display = display[1:]  # strip leading zero

    if not speech:
        return display

    def _hour_declension(hour):
        # Inflect the spoken hour for use after "pol" / "do".
        base = pronounce_number_sl(hour)
        if hour == 1:
            return base[:-1] + "ih"
        if hour in (2, 4):
            return base + "h"
        if hour == 3:
            return base[:-1] + "eh"
        if hour in (7, 8):
            return base[:-2] + "mih"
        return base + "ih"

    # Generate a speakable version of the time
    if use_24hour:
        # "13 nič nič"
        minutes = display[3:5]
        spoken = pronounce_number_sl(int(display[0:2])) + " "
        if minutes == '00':
            return spoken + "nič nič"
        if minutes[0] == '0':
            return (spoken + pronounce_number_sl(0) + " "
                    + pronounce_number_sl(int(minutes[1])))
        return spoken + pronounce_number_sl(int(minutes))

    if dt.minute == 0 and dt.hour == 0:
        return "polnoč"
    if dt.minute == 0 and dt.hour == 12:
        return "poldne"

    hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    if dt.minute == 0:
        spoken = pronounce_number_sl(hour)
    elif dt.minute < 30:
        spoken = pronounce_number_sl(dt.minute) + " čez " + \
            pronounce_number_sl(hour)
    else:
        # From half past onwards the time is counted towards the next hour
        upcoming = (dt.hour + 1) % 12 or 12
        if dt.minute == 30:
            spoken = "pol " + _hour_declension(upcoming)
        else:
            spoken = pronounce_number_sl(60 - dt.minute) + " do " + \
                _hour_declension(upcoming)

    if use_ampm:
        spoken += " p.m." if dt.hour > 11 else " a.m."

    return spoken
+ """ + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. "1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_SV[den] + if whole == 0: + if num == 1: + return_string = 'en {}'.format(den_str) + else: + return_string = '{} {}'.format(num, den_str) + elif num == 1: + return_string = '{} och en {}'.format(whole, den_str) + else: + return_string = '{} och {} {}'.format(whole, num, den_str) + if num > 1: + return_string += 'ar' + return return_string + + +def pronounce_number_sv(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + num(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + # TODO short_scale, scientific and ordinals + # currently ignored + + def pronounce_triplet_sv(num): + result = "" + num = floor(num) + + if num > 99: + hundreds = floor(num / 100) + if hundreds > 0: + if hundreds == 1: + result += 'ett' + 'hundra' + else: + result += _NUM_STRING_SV[hundreds] + 'hundra' + + num -= hundreds * 100 + + if num == 0: + result += '' # do nothing + elif num == 1: + result += 'ett' + elif num <= 20: + result += _NUM_STRING_SV[num] + elif num > 20: + tens = num % 10 + ones = num - tens + + if ones > 0: + result += _NUM_STRING_SV[ones] + if tens > 0: + result += _NUM_STRING_SV[tens] + + return result + + def 
def pronounce_ordinal_sv(number):
    """
    This function pronounces a number as an ordinal

    1 -> first
    2 -> second

    Args:
        number (int): the number to format
    Returns:
        (str): The pronounced number string. Negative or non-integral
            input is returned unchanged.
    """
    # ordinals for 1, 3, 7 and 8 are irregular
    # this produces the base form, it will have to be adapted for genus,
    # casus, numerus
    ordinals = ["noll", "första", "andra", "tredje", "fjärde", "femte",
                "sjätte", "sjunde", "åttonde", "nionde", "tionde"]

    if number < 0 or number != int(number):
        return number

    # Integral floats (e.g. 3.0) must become ints before indexing the table.
    number = int(number)
    if number <= 10:
        # Covers 10 as well: the generic path below would yield only "de".
        return ordinals[number]

    # NOTE(review): 11-19 are still composed as "tio" + unit ordinal;
    # proper teen ordinals are not implemented here.
    tens, ones = divmod(number, 10)
    result = pronounce_number_sv(tens * 10).rstrip()
    if ones > 0:
        result += ordinals[ones]
    else:
        result += 'de'

    return result


def nice_time_sv(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'five thirty' for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    elif use_ampm:
        # e.g. "03:01 AM" or "02:22 PM"
        string = dt.strftime("%I:%M %p")
    else:
        # e.g. "03:01" or "02:22"
        string = dt.strftime("%I:%M")

    if not speech:
        return string

    minute = dt.minute

    # Generate a speakable version of the time
    speak = ""
    if use_24hour:
        if dt.hour == 1:
            speak += "ett"  # 01:00 is "ett" not "en"
        else:
            speak += pronounce_number_sv(dt.hour)
        if minute != 0:
            if minute < 10:
                speak += ' noll'

            if minute == 1:
                speak += ' ett'
            else:
                speak += " " + pronounce_number_sv(minute)

        return speak  # ampm is ignored when use_24hour is true

    hour = dt.hour
    if minute != 0:
        if minute < 30:
            # "<n> [minut(er)] över <hour>"
            speak += 'kvart' if minute == 15 else pronounce_number_sv(minute)
            if minute == 1:
                speak += ' minut över '
            elif minute not in (5, 10, 15):
                speak += ' minuter över '
            else:
                speak += ' över '
        elif minute > 30:
            # "<n> [minut(er)] i <next hour>"
            remaining = 60 - minute
            speak += ('kvart' if remaining == 15
                      else pronounce_number_sv(remaining))
            if remaining == 1:
                # singular applies at :59 (one minute to the hour)
                speak += ' minut i '
            elif remaining not in (5, 10, 15):
                speak += ' minuter i '
            else:
                speak += ' i '
            hour = (hour + 1) % 12
        else:
            # half past is expressed relative to the next hour
            speak += 'halv '
            hour = (hour + 1) % 12

    if hour == 0 and minute == 0:
        return "midnatt"
    if hour == 12 and minute == 0:
        return "middag"
    # TODO: "half past 3", "a quarter of 4" and other idiomatic times

    if hour == 0:
        speak += pronounce_number_sv(12)
    elif hour <= 13:
        if hour == 1 or hour == 13:  # 01:00 and 13:00 is "ett"
            speak += 'ett'
        else:
            speak += pronounce_number_sv(hour)
    else:
        speak += pronounce_number_sv(hour - 12)

    if use_ampm:
        if dt.hour > 11:
            if dt.hour < 18:
                # 12:01 - 17:59 afternoon
                speak += " på eftermiddagen"
            elif dt.hour < 22:
                # 18:00 - 21:59 evening
                speak += " på kvällen"
            else:
                # 22:00 - 23:59 at night
                speak += " på natten"
        elif dt.hour < 3:
            # 00:01 - 02:59 at night
            speak += " på natten"
        else:
            # 03:00 - 11:59 in the morning
            speak += " på morgonen"

    return speak


def nice_response_sv(text):
    """Post-process a Swedish response for speech.

    Replaces a standalone "^" that is followed by a numeric token with
    "upphöjt till" (to the power of) and, when the text mentions a month,
    spells out ordinals written as "<digits>." before month names.

    Args:
        text (str): the response text
    Returns:
        (str): the adjusted text; unchanged if nothing applied
    """
    words = text.split()

    caret_replaced = False
    for idx, word in enumerate(words):
        if word != '^':
            continue
        word_next = words[idx + 1] if idx + 1 < len(words) else ""
        if word_next.isnumeric():
            words[idx] = "upphöjt till"
            caret_replaced = True
    if caret_replaced:
        text = " ".join(words)

    # Ordinals are handled after the caret pass so that a "^" appearing
    # after a month name can no longer overwrite the ordinal replacement.
    if any(word.lower() in _MONTHS_SV for word in words):
        text = _nice_ordinal_sv(text)

    return text


def _nice_ordinal_sv(text, speech=True):
    """Spell out "<digits>." tokens that directly precede a month name.

    The ordinal gets an "n" suffix after the listed articles/prepositions
    and an "r" suffix otherwise. The text is re-joined with single spaces
    as soon as any token ends with a period.

    Args:
        text (str): the text to normalize
        speech (bool): unused
    Returns:
        (str): the normalized text
    """
    words = text.split()
    saw_trailing_dot = False

    for idx, word in enumerate(words):
        if word[-1:] != ".":
            continue
        saw_trailing_dot = True
        if not word[:-1].isdecimal():
            continue
        word_next = words[idx + 1] if idx + 1 < len(words) else ""
        if word_next.lower() not in _MONTHS_SV:
            continue
        word_prev = words[idx - 1] if idx > 0 else ""
        ordinal = pronounce_ordinal_sv(int(word[:-1]))
        if word_prev.lower() in ["om", "den", "från", "till",
                                 "(från", "(om", "till"]:
            ordinal += "n"
        else:
            # The original `elif wordPrev not in ["den"]` was always true
            # here, because "den" is already matched by the list above.
            ordinal += "r"
        words[idx] = ordinal

    return " ".join(words) if saw_trailing_dot else text
def is_fractional_ca(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    The word is lower-cased and reduced to its masculine singular form
    before it is looked up in the table of known fraction names.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
            (currently unused)
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # Normalize case once, so the suffix rules and the table lookup agree.
    word = input_str.lower()

    if word.endswith('é'):
        word = word[:-1] + "è"  # e.g. "cinqué -> cinquè"
    elif word.endswith('ena'):
        word = word[:-3] + "è"  # e.g. "cinquena -> cinquè"
    elif word.endswith('ens'):
        word = word[:-3] + "è"  # e.g. "cinquens -> cinquè"
    elif word.endswith('enes'):
        word = word[:-4] + "è"  # e.g. "cinquenes -> cinquè"
    elif word.endswith('os'):
        word = word[:-2]  # e.g. "terços -> terç"
    elif word in ('terceres', 'tercera'):
        word = "terç"  # e.g. "tercera -> terç"
    elif word in ('mitges', 'mitja', 'meitat', 'meitats'):
        word = "mig"  # e.g. "mitges -> mig"
    elif word.endswith('a'):
        word = word[:-1]  # e.g. "quarta -> quart"
    elif word.endswith('es'):
        word = word[:-2]  # e.g. "quartes -> quart"
    elif word.endswith('s'):
        word = word[:-1]  # e.g. "quarts -> quart"

    fraction_names = ["mig", "terç", "quart", "cinquè", "sisè", "setè",
                      "vuitè", "novè", "desè", "onzè", "dotzè", "tretzè",
                      "catorzè", "quinzè", "setzè", "dissetè", "divuitè",
                      "dinovè"]
    # "mig" is 1/2, "terç" is 1/3, ... so the denominator is position + 2.
    denominators = {name: position + 2
                    for position, name in enumerate(fraction_names)}
    denominators.update({
        "sètè": 7,  # misspelling the original table used; kept as alias
        "vintè": 20,
        "trentè": 30,
        "centè": 100,
        "milè": 1000,
        "huitè": 8,  # Valencian variants
        "dihuitè": 18,
    })

    if word in denominators:
        return 1.0 / denominators[word]

    return False
+ if word in _NUMBERS_CA: + val = _NUMBERS_CA[word] + elif '-' in word: + wordparts = word.split('-') + # trenta-cinc > 35 + if len(wordparts) == 2 and (wordparts[0] in _TENS_CA and wordparts[1] in _AFTER_TENS_CA): + val = _TENS_CA[wordparts[0]] + _AFTER_TENS_CA[wordparts[1]] + # vint-i-dues > 22 + elif len(wordparts) == 3 and wordparts[1] == 'i' and (wordparts[0] in _TENS_CA and wordparts[2] in _AFTER_TENS_CA): + val = _TENS_CA[wordparts[0]]+_AFTER_TENS_CA[wordparts[2]] + # quatre-centes > 400 + elif len(wordparts) == 2 and (wordparts[0] in _BEFORE_HUNDREDS_CA and wordparts[1] in _HUNDREDS_CA): + val = _BEFORE_HUNDREDS_CA[wordparts[0]]*100 + + elif word.isdigit(): # doesn't work with decimals + val = int(word) + elif is_numeric(word): + val = float(word) + elif is_fractional_ca(word): + if not result: + result = 1 + result = result * is_fractional_ca(word) + count += 1 + continue + + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + # if (len(aPieces) == 2 and is_numeric(aPieces[0]) + # and is_numeric(aPieces[1])): + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + + if val: + if result is None: + result = 0 + # handle fractions + #TODO: caution, review use of "ens" word + if next_word != "ens": + result += val + else: + result = float(result) / float(val) + + if next_word is None: + break + + # number word and fraction + ands = ["i"] + if next_word in ands: + zeros = 0 + if result is None: + count += 1 + continue + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + + afterAndVal = extract_number_ca(newText[:-1]) + if afterAndVal: + if result < afterAndVal or result < 20: + while afterAndVal > 1: + afterAndVal = afterAndVal / 10.0 + for word in newWords: + if word == "zero" or word == "0": + zeros += 1 + else: + break + for _ in range(0, zeros): + afterAndVal = afterAndVal / 10.0 + result += afterAndVal + break + elif next_next_word is not None: + if next_next_word 
class CatalanNormalizer(Normalizer):
    """Normalizer configured from the ca-es ``normalize.json`` resource."""

    # JSON is UTF-8; state the encoding so that loading does not depend on
    # the platform's default locale encoding.
    with open(resolve_resource_file("text/ca-es/normalize.json"),
              encoding="utf-8") as f:
        _default_config = json.load(f)

    @staticmethod
    def tokenize(utterance):
        """Split an utterance into whitespace-separated tokens.

        Digits are detached from a following "%" and from a preceding
        "#", and a trailing lone "-" token is dropped.

        Args:
            utterance (str): the text to tokenize
        Returns:
            (list of str): the tokens; empty for blank input
        """
        # Split things like 12%
        utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
        # Split things like #1
        utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance)
        # Hyphenated words (e.g. "vint-i-dues") are deliberately kept whole.
        tokens = utterance.split()
        # Guard against blank input, where there is no last token to inspect.
        if tokens and tokens[-1] == '-':
            tokens = tokens[:-1]

        return tokens


def normalize_ca(text, remove_articles=True):
    """ CA string normalization """
    return CatalanNormalizer().normalize(text, remove_articles)
noise_words = ["el", "l", "els", "la", "les", "es", "sa", "ses", + "d", "de", "del", "dels"] + # add final space + s = s + " " + + s = s.lower() + + for word in symbols: + s = s.replace(word, "") + + for word in hyphens: + s = s.replace(word, " ") + + for word in noise_words: + s = s.replace(" " + word + " ", " ") + + + # handle synonims, plurals and equivalents, "demà ben d'hora" = "demà de matí" + synonims = {"abans": ["abans-d"], + "vinent": ["que vé", "que ve", "que bé", "que be"], + "migdia": ["mig dia"], + "mitjanit": ["mitja nit"], + "matinada": ["matinades", "ben hora ben hora"], + "matí": ["matins", "dematí", "dematins", "ben hora"], + "tarda": ["tardes", "vesprada", "vesprades", "vespraes"], + "nit": ["nits", "vespre", "vespres", "horabaixa", "capvespre"], + "demà": ["endemà"], + "diàriament": ["diària", "diàries", "cada dia", "tots dies"], + "setmanalment": ["setmanal", "setmanals", "cada setmana", "totes setmanes"], + "quinzenalment": ["quinzenal", "quinzenals", "cada quinzena", "totes quinzenes"], + "mensualment": ["mensual", "mensuals", "cada mes", "tots mesos"], + "anualment": ["anual", "anuals", "cada any", "tots anys"], + "demàpassat": ["demà-passat", "demà passat", "passat demà", "despús-demà", "despús demà"], + "demàpassatpassat": ["demàpassat passat", "passat demàpassat", + "demàpassat no altre", "demàpassat altre"], + "abansahir": ["abans ahir", "despús ahir", "despús-ahir"], + "abansabansahir": ["abans abansahir", "abansahir no altre", "abansahir altre", + "abansahir no altre", "abansahir altre"], + "segon": ["segons"], + "minut": ["minuts"], + "quart": ["quarts"], + "hora": ["hores"], + "dia": ["dies"], + "setmana": ["setmanes"], + "quinzena": ["quinzenes"], + "mes": ["mesos"], + "any": ["anys"], + "tocat": ["tocats"], + "a": ["al", "als"] + } + for syn in synonims: + for word in synonims[syn]: + s = s.replace(" " + word + " ", " " + syn + " ") + + # remove final space + if s[-1] == " ": + s = s[:-1] + + + return s + + def date_found(): + 
return found or \ + ( + datestr != "" or timeStr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(text).split(" ") + timeQualifiersList = ['matí', 'tarda', 'nit'] + time_indicators = ["em", "a", "a les", "cap a", "vora", "després", "estas", + "no", "dia", "hora"] + days = ['dilluns', 'dimarts', 'dimecres', + 'dijous', 'divendres', 'dissabte', 'diumenge'] + months = ['gener', 'febrer', 'març', 'abril', 'maig', 'juny', + 'juliol', 'agost', 'setembre', 'octubre', 'novembre', + 'desembre'] + monthsShort = ['gen', 'feb', 'març', 'abr', 'maig', 'juny', 'jul', 'ag', + 'set', 'oct', 'nov', 'des'] + nexts = ["pròxim", "pròxima", "vinent"] + suffix_nexts = ["següent", "després"] + lasts = ["últim", "última", "darrer", "darrera", "passat", "passada"] + suffix_lasts = ["passada", "passat", "anterior", "abans"] + nxts = ["passat", "després", "segueix", "seguit", "seguida", "següent", "pròxim", "pròxima"] + prevs = ["abans", "prèvia", "previamente", "anterior"] + froms = ["partir", "dins", "des", "a", + "després", "pròxima", "pròxim", "del", "de"] + thises = ["aquest", "aquesta", "aqueix", "aqueixa", "este", "esta"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) 
else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif word == "avui" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "demà" and not fromFlag: + dayOffset += 1 + used += 1 + elif word == "ahir" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "abansahir") and not fromFlag: + dayOffset -= 2 + used += 1 + elif word == "abansabansahir" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow and after after tomorrow + elif word == "demàpassat" and not fromFlag: + dayOffset += 2 + used = 1 + elif word == "demàpassatpassat" and not fromFlag: + dayOffset += 3 + used = 1 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "dia": + if wordNext == "després" or wordNext == "abans": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "setmana" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if 
wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "any" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + for w in nexts: + if wordPrev == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in lasts: + if wordPrev == w: + dayOffset -= 7 + used += 1 + start -= 1 + for w in suffix_nexts: + if wordNext == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in suffix_lasts: + if wordNext == w: + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "feira": + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 maig + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # maig 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + 
datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia maig + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # maig dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("avui") + validFollowups.append("demà") + validFollowups.append("ahir") + validFollowups.append("abansahir") + validFollowups.append("abansabansahir") + validFollowups.append("demàpassat") + validFollowups.append("ara") + validFollowups.append("ja") + validFollowups.append("abans") + + # TODO debug word "passat" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == "demà" and wordNext == "ahir") and not ( + word == "passat" or word == "abans" or word == "em"): + used = 2 + fromFlag = True + if wordNext == "demà": + dayOffset += 1 + elif wordNext == "ahir": + dayOffset -= 1 + elif wordNext == "abansahir": + dayOffset -= 2 + elif wordNext == "abansabansahir": + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if wordNextNext == "dia": + used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if 
wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + if wordNextNextNext == "dia": + used += 1 + if wordNext in months: + used -= 1 + if used > 0: + + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "migdia": + hrAbs = 12 + used += 1 + elif word == "mijanit": + hrAbs = 0 + used += 1 + elif word == "matí": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "tarda": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "mitja" and wordNext == "tarda": + if not hrAbs: + hrAbs = 17 + used += 2 + elif word == "mig" and wordNext == "matí": + if not hrAbs: + hrAbs = 10 + used += 2 + elif word == "vespre" or (word == "final" and wordNext == "tarda"): + if not hrAbs: + hrAbs = 19 + used += 2 + elif word == "final" and wordNext == "matí": + if not hrAbs: + hrAbs = 11 + used += 2 + elif word == "matinada": + if not hrAbs: + hrAbs = 4 + used += 1 + elif word == "nit": + if not hrAbs: + hrAbs = 22 + used += 1 + # parse half an hour, quarter hour + elif word == "hora" and \ + (wordPrev in time_indicators or wordPrevPrev in + time_indicators): + if wordPrev == "mitja": + minOffset = 30 + elif wordPrev == "quart": + 
minOffset = 15 + elif wordPrevPrev == "quart": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "matí": + remainder = "am" + used += 1 + elif (wordNext == "tarda" or wordNext == "vespre"): + remainder = "pm" + used += 1 + elif wordNext == "nit": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == "matí": + remainder = "am" + used = 2 + elif wordNext in thises and (wordNextNext == "tarda" or wordNextNext == "vespre"): + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "nit": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + military = True + if strHH <= 12 and \ + (timeQualifier == "matí" or + timeQualifier == "tarda"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. 
+ length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarda" or + wordNext == "vespre"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." or + wordNext == "matí"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" or + wordPrev == "zero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minut": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segon": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "en" and wordNextNext == "punt"): + strHH = word + strMM = 00 + if wordNext == "en" and wordNextNext == "punt": + used += 2 + if (wordNextNextNext == "tarda" or wordNextNextNext == "vespre"): + remainder = "pm" + used += 1 + elif wordNextNextNext == "matí": + 
remainder = "am" + used += 1 + elif wordNextNextNext == "nit": + if 0 > int(strHH) > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + military = True + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "en" or wordPrev == "punt": + words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = 
extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + minute=temp.strftime("%M"), + second=temp.strftime("%S")) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if (hrAbs or 0) != -1 and (minAbs or 0) != -1: + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + resultStr = _ca_pruning(resultStr) + return [extractedDate, resultStr] + + +def _ca_pruning(text, symbols=True, accents=False, agressive=True): + # agressive ca word pruning + words = ["l", "la", "el", "els", "les", "de", "dels", + "ell", "ells", "me", "és", "som", "al", "a", "dins", "per", + "aquest", "aquesta", "això", "aixina", "en", "aquell", "aquella", + "va", "vam", "vaig", "quin", "quina"] + if 
symbols: + symbols = [".", ",", ";", ":", "!", "?", "¡", "¿"] + for symbol in symbols: + text = text.replace(symbol, "") + text = text.replace("'", " ").replace("_", " ") + # accents=False + if accents: + accents = {"a": ["á", "à", "ã", "â"], + "e": ["ê", "è", "é"], + "i": ["í", "ï"], + "o": ["ò", "ó"], + "u": ["ú", "ü"], + "c": ["ç"], + "ll": ["l·l"], + "n": ["ñ"]} + for char in accents: + for acc in accents[char]: + text = text.replace(acc, char) + if agressive: + text_words = text.split(" ") + for idx, word in enumerate(text_words): + if word in words: + text_words[idx] = "" + text = " ".join(text_words) + text = ' '.join(text.split()) + return text + + +def get_gender_ca(word, context=""): + """ Guess the gender of a word + + Some languages assign genders to specific words. This method will attempt + to determine the gender, optionally using the provided context sentence. + + Args: + word (str): The word to look up + context (str, optional): String containing word, for context + + Returns: + str: The code "m" (male), "f" (female) or "n" (neutral) for the gender, + or None if unknown/or unused in the given language. 
+ """ + # parse gender taking context into account + word = word.lower() + words = context.lower().split(" ") + for idx, w in enumerate(words): + if w == word and idx != 0: + # in Catalan usually the previous word (a determinant) + # assigns gender to the next word + previous = words[idx - 1].lower() + if previous in _MALE_DETERMINANTS_CA: + return "m" + elif previous in _FEMALE_DETERMINANTS_CA: + return "f" + + # get gender using only the individual word + # see if this word has the gender defined + if word in _GENDERS_CA: + return _GENDERS_CA[word] + singular = word.rstrip("s") + if singular in _GENDERS_CA: + return _GENDERS_CA[singular] + # in Catalan the last vowel usually dosn't defines the gender of a word + # the gender of the determinant takes precedence over this rule + for end_str in _FEMALE_ENDINGS_CA: + if word.endswith(end_str): + return "f" + for end_str in _MALE_ENDINGS_CA: + if word.endswith(end_str): + return "m" + return None diff --git a/lingua_franca/lang/parse_common.py b/lingua_franca/lang/parse_common.py new file mode 100644 index 0000000..81c97f5 --- /dev/null +++ b/lingua_franca/lang/parse_common.py @@ -0,0 +1,387 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class Normalizer:
    """
    Config-driven utterance normalizer.

    Individual languages may subclass this if needed; normalize_XX should
    pass a valid config read from json. Every option is looked up in
    ``self.config`` and falls back to the default given by the matching
    property.
    """
    _default_config = {}

    # Built-in accent table, grouped by the plain letter each variant
    # collapses to.
    _FALLBACK_ACCENTS = (
        ("a", ("á", "à", "ã", "â")),
        ("e", ("é", "è", "ê", "ẽ")),
        ("i", ("í", "ì", "î", "ĩ")),
        ("o", ("ò", "ó", "ô", "õ")),
        ("u", ("ú", "ù", "û", "ũ")),
        ("A", ("Á", "À", "Ã", "Â")),
        ("E", ("É", "È", "Ê", "Ẽ")),
        ("I", ("Í", "Ì", "Î", "Ĩ")),
        ("O", ("Ò", "Ó", "Ô", "Õ")),
        ("U", ("Ú", "Ù", "Û", "Ũ")),
    )
    # Built-in list of symbols replaced by a space in remove_symbols().
    _FALLBACK_SYMBOLS = (";", "_", "!", "?", "<", ">",
                         "|", "(", ")", "=", "[", "]", "{",
                         "}", "»", "«", "*", "~", "^", "`")

    def __init__(self, config=None):
        self.config = config or self._default_config

    @staticmethod
    def tokenize(utterance):
        """Split on whitespace, detaching "%" and "#" from numbers."""
        # "12%" -> "12 %"
        spaced = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
        # "#1" -> "# 1"
        spaced = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", spaced)
        return spaced.split()

    # -- configuration lookups ------------------------------------------

    def _setting(self, key, fallback):
        # Single lookup point for the config-backed properties below.
        return self.config.get(key, fallback)

    @property
    def should_lowercase(self):
        return self._setting("lowercase", False)

    @property
    def should_numbers_to_digits(self):
        return self._setting("numbers_to_digits", True)

    @property
    def should_expand_contractions(self):
        return self._setting("expand_contractions", True)

    @property
    def should_remove_symbols(self):
        return self._setting("remove_symbols", False)

    @property
    def should_remove_accents(self):
        return self._setting("remove_accents", False)

    @property
    def should_remove_articles(self):
        return self._setting("remove_articles", False)

    @property
    def should_remove_stopwords(self):
        return self._setting("remove_stopwords", False)

    @property
    def contractions(self):
        return self._setting("contractions", {})

    @property
    def word_replacements(self):
        return self._setting("word_replacements", {})

    @property
    def number_replacements(self):
        return self._setting("number_replacements", {})

    @property
    def accents(self):
        builtin = {variant: plain
                   for plain, variants in self._FALLBACK_ACCENTS
                   for variant in variants}
        return self._setting("accents", builtin)

    @property
    def stopwords(self):
        return self._setting("stopwords", [])

    @property
    def articles(self):
        return self._setting("articles", [])

    @property
    def symbols(self):
        return self._setting("symbols", list(self._FALLBACK_SYMBOLS))

    # -- token-level helpers --------------------------------------------

    def _swap_tokens(self, utterance, table):
        # Re-join the tokens, substituting those that are keys of `table`.
        return " ".join(table[tok] if tok in table else tok
                        for tok in self.tokenize(utterance))

    def _blank_tokens(self, utterance, unwanted):
        # Re-join the tokens with every unwanted one replaced by "".
        # The surplus spaces this leaves are cleaned up by normalize().
        return " ".join("" if tok in unwanted else tok
                        for tok in self.tokenize(utterance))

    # -- individual transformations -------------------------------------

    def expand_contractions(self, utterance):
        """ Expand common contractions, e.g. "isn't" -> "is not" """
        return self._swap_tokens(utterance, self.contractions)

    def numbers_to_digits(self, utterance):
        """ Replace tokens found in number_replacements """
        return self._swap_tokens(utterance, self.number_replacements)

    def remove_articles(self, utterance):
        """ Blank out every token listed in articles """
        return self._blank_tokens(utterance, self.articles)

    def remove_stopwords(self, utterance):
        """ Blank out stopwords, then drop an orphaned trailing hyphen """
        pruned = self._blank_tokens(utterance, self.stopwords)
        # A "-" left dangling at the end (plus any spaces after it) goes.
        return re.sub(r'- *$', '', pruned)

    def remove_symbols(self, utterance):
        """ Replace every configured symbol with a space """
        for mark in self.symbols:
            utterance = utterance.replace(mark, " ")
        return utterance

    def remove_accents(self, utterance):
        """ Replace accented characters using the accents table """
        table = self.accents
        for accented in table:
            utterance = utterance.replace(accented, table[accented])
        return utterance

    def replace_words(self, utterance):
        """ Replace tokens found in word_replacements """
        return self._swap_tokens(utterance, self.word_replacements)

    def normalize(self, utterance="", remove_articles=None):
        """
        Run the enabled transformations and collapse extra spaces.

        Args:
            utterance (str): text to normalize
            remove_articles (bool, optional): legacy switch; a truthy
                value forces article removal, anything else defers to
                the "remove_articles" config entry

        Returns:
            str: the normalized utterance
        """
        # mutations
        if self.should_lowercase:
            utterance = utterance.lower()
        if self.should_expand_contractions:
            utterance = self.expand_contractions(utterance)
        if self.should_numbers_to_digits:
            utterance = self.numbers_to_digits(utterance)
        utterance = self.replace_words(utterance)

        # removals
        if self.should_remove_symbols:
            utterance = self.remove_symbols(utterance)
        if self.should_remove_accents:
            utterance = self.remove_accents(utterance)
        # TODO deprecate remove_articles param, backwards compat
        if remove_articles or self.should_remove_articles:
            utterance = self.remove_articles(utterance)
        if self.should_remove_stopwords:
            utterance = self.remove_stopwords(utterance)

        # remove extra spaces
        return " ".join(filter(None, utterance.split(" ")))
def partition_list(items, split_on):
    """
    Cut a list into runs separated by delimiter items.

    Works similarly to str.partition, applied repeatedly: every item for
    which ``split_on`` returns True becomes a one-element partition of its
    own, and the items between delimiters are grouped together. Empty
    groups are never emitted.

    Args:
        items: iterable of items to partition
        split_on callable:
            Called with each item in turn; a truthy result marks the
            item as a delimiter.

    Returns:
        [[any]]

    """
    groups = []
    run = []
    for element in items:
        if not split_on(element):
            run.append(element)
            continue
        # Delimiter: flush the run collected so far, then emit the
        # delimiter as its own partition.
        if run:
            groups.append(run)
            run = []
        groups.append([element])
    if run:
        groups.append(run)
    return groups
def extract_numbers_generic(text, pronounce_handler, extract_handler,
                            short_scale=True, ordinals=False):
    """
    Collect every number found in a string, in order of appearance.
    Language agnostic, per language parsers need to be provided.

    The string is consumed from the right: ``extract_handler`` yields the
    last number still present, that number is blanked out, and the
    handler is asked again until it finds nothing.

    Args:
        text (str): the string to extract numbers from
        pronounce_handler (function): turns a number into its spoken form
        extract_handler (function): called as
            ``extract_handler(text, short_scale, ordinals)``; returns the
            last number present in the text, or a falsy value if none
        short_scale (bool): passed through to ``extract_handler``;
            selects "short scale" or "long scale" for large numbers.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): passed through to ``extract_handler``; consider
            ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: the values returned by ``extract_handler``, left to right
    """
    def swap_last(haystack, needle, substitute):
        # maxsplit=1 on rsplit touches only the right-most occurrence
        return substitute.join(haystack.rsplit(needle, 1))

    found = []
    remaining = text
    current = extract_handler(text, short_scale, ordinals)
    while current:
        found.append(current)
        before = remaining

        spoken = pronounce_handler(current)
        digits = str(current)
        if digits.endswith(".0"):
            digits = digits[:-2]

        # Turn the spoken form into digits, then blank those digits so
        # the next pass sees only the numbers further to the left.
        remaining = swap_last(remaining, spoken, digits)
        remaining = swap_last(remaining, digits, " ")

        if remaining == before:
            # Nothing was consumed: the pronounced number can differ
            # from the extracted text (pronounce(0.5) != "half" while
            # extract("half") == 0.5). Stop instead of looping forever.
            # TODO fix this
            break
        current = extract_handler(remaining, short_scale, ordinals)

    found.reverse()
    return found
def generate_plurals_cs(originals):
    """
    Build the "ý"-suffixed variants of the given words.

    Args:
        originals set(str) or dict(str, any): words to extend; for a
            dict the suffix is added to the keys and the values are kept

    Returns:
        set(str) or dict(str, any): a new container, the input is left
            untouched

    """
    suffix = 'ý'
    if not isinstance(originals, dict):
        return {word + suffix for word in originals}
    return {word + suffix: payload for word, payload in originals.items()}
+_STRING_SHORT_ORDINAL_CS = invert_dict(_SHORT_ORDINAL_CS) +_STRING_LONG_ORDINAL_CS = invert_dict(_LONG_ORDINAL_CS) + + +def _convert_words_to_numbers_cs(text, short_scale=True, ordinals=False): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. first, second, third) should + be parsed to their number values (1, 2, 3...) + + Returns: + str + The original text, with numbers subbed in where appropriate. + + """ + text = text.lower() + tokens = tokenize(text) + numbers_to_replace = \ + _extract_numbers_with_text_cs(tokens, short_scale, ordinals) + numbers_to_replace.sort(key=lambda number: number.start_index) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) + else: + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + + +def _extract_numbers_with_text_cs(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (first, second, third, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. 
+ + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + _extract_number_with_text_cs(tokens, short_scale, + ordinals, fractional_numbers) + + if not to_replace: + break + + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + +def _extract_number_with_text_cs(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + This function extracts a number from a list of Tokens. + + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + _extract_number_with_text_cs_helper(tokens, short_scale, + ordinals, fractional_numbers) + # while tokens and tokens[0].word in _ARTICLES_CS: + # tokens.pop(0) + return ReplaceableNumber(number, tokens) + + +def _extract_number_with_text_cs_helper(tokens, + short_scale=True, ordinals=False, + fractional_numbers=True): + """ + Helper for _extract_number_with_text_en. + + This contains the real logic for parsing, but produces + a result that needs a little cleaning (specific, it may + contain leading articles that can be trimmed off). 
+ + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + int or float, [Tokens] + + """ + if fractional_numbers: + fraction, fraction_text = \ + _extract_fraction_with_text_cs(tokens, short_scale, ordinals) + if fraction: + return fraction, fraction_text + + decimal, decimal_text = \ + _extract_decimal_with_text_cs(tokens, short_scale, ordinals) + if decimal: + return decimal, decimal_text + + return _extract_whole_number_with_text_cs(tokens, short_scale, ordinals) + + +def _extract_fraction_with_text_cs(tokens, short_scale, ordinals): + """ + Extract fraction numbers from a string. + + This function handles text such as '2 and 3/4'. Note that "one half" or + similar will be parsed by the whole number function. + + Args: + tokens [Token]: words and their indexes in the original string. + short_scale boolean: + ordinals boolean: + + Returns: + (int or float, [Token]) + The value found, and the list of relevant tokens. + (None, None) if no fraction value is found. + + """ + for c in _FRACTION_MARKER: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_cs(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_cs(partitions[2], short_scale, + ordinals, fractional_numbers=True) + + if not numbers1 or not numbers2: + return None, None + + # ensure first is not a fraction and second is a fraction + num1 = numbers1[-1] + num2 = numbers2[0] + if num1.value >= 1 and 0 < num2.value < 1: + return num1.value + num2.value, \ + num1.tokens + partitions[1] + num2.tokens + + return None, None + + +def _extract_decimal_with_text_cs(tokens, short_scale, ordinals): + """ + Extract decimal numbers from a string. + + This function handles text such as '2 point 5'. 
+ + Notes: + While this is a helper for extract_number_xx, it also depends on + extract_number_xx, to parse out the components of the decimal. + + This does not currently handle things like: + number dot number number number + + Args: + tokens [Token]: The text to parse. + short_scale boolean: + ordinals boolean: + + Returns: + (float, [Token]) + The value found and relevant tokens. + (None, None) if no decimal value is found. + + """ + for c in _DECIMAL_MARKER: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_cs(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_cs(partitions[2], short_scale, + ordinals, fractional_numbers=False) + + if not numbers1 or not numbers2: + return None, None + + number = numbers1[-1] + decimal = numbers2[0] + + # TODO handle number dot number number number + if "." not in str(decimal.text): + return number.value + float('0.' + str(decimal.value)), \ + number.tokens + partitions[1] + decimal.tokens + return None, None + + +def _extract_whole_number_with_text_cs(tokens, short_scale, ordinals): + """ + Handle numbers not handled by the decimal or fraction functions. This is + generally whole numbers. Note that phrases such as "one half" will be + handled by this function, while "one and a half" are handled by the + fraction function. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. 
+ + """ + multiplies, string_num_ordinal, string_num_scale = \ + _initialize_number_data(short_scale) + + number_words = [] # type: [Token] + val = False + prev_val = None + next_val = None + to_sum = [] + for idx, token in enumerate(tokens): + current_val = None + if next_val: + next_val = None + continue + + word = token.word + # if word in _ARTICLES_CS or word in _NEGATIVES: + if word in word in _NEGATIVES: + number_words.append(token) + continue + + prev_word = tokens[idx - 1].word if idx > 0 else "" + next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" + + # In czech we do no use suffix (1st,2nd,..) but use point instead (1.,2.,..) + if is_numeric(word[:-1]) and \ + (word.endswith(".")): + + # explicit ordinals, 1st, 2nd, 3rd, 4th.... Nth + word = word[:-1] + + # handle nth one + # if next_word == "one": + # would return 1 instead otherwise + # tokens[idx + 1] = Token("", idx) + # next_word = "" + + # Normalize Czech inflection of numbers(jedna,jeden,jedno,...) + if not ordinals: + word = _text_cs_inflection_normalize(word, 1) + + if word not in string_num_scale and \ + word not in _STRING_NUM_CS and \ + word not in _SUMS and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not isFractional_cs(word, short_scale=short_scale) and \ + not look_for_fractions(word.split('/')): + words_only = [token.word for token in number_words] + # if number_words and not all([w in _ARTICLES_CS | + # _NEGATIVES for w in words_only]): + if number_words and not all([w in _NEGATIVES for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and prev_word not in multiplies \ + and prev_word not in _SUMS \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in _NEGATIVES: # \ + # and prev_word not in _ARTICLES_CS: + number_words = [token] + elif prev_word in _SUMS and word in _SUMS: + number_words = [token] + else: + 
number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? + if word in _STRING_NUM_CS: + val = _STRING_NUM_CS.get(word) + current_val = val + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + + # is the prev word an ordinal number and current word is one? + # second one, third one + if ordinals and prev_word in string_num_ordinal and val == 1: + val = prev_val + + # is the prev word a number and should we sum it? + # twenty two, fifty six + if (prev_word in _SUMS and val and val < 10) or all([prev_word in + multiplies, + val < prev_val if prev_val else False]): + val = prev_val + val + + # For Czech only: If Ordinal previous number will be also in ordinal number format + # dvacátý první = twentieth first + if (prev_word in string_num_ordinal and val and val < 10) or all([prev_word in + multiplies, + val < prev_val if prev_val else False]): + val = prev_val + val + + # is the prev word a number and should we multiply it? + # twenty hundred, six hundred + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + + # is this a spoken fraction? + # half cup + if val is False: + val = isFractional_cs(word, short_scale=short_scale) + current_val = val + + # 2 fifths + if not ordinals: + next_val = isFractional_cs(next_word, short_scale=short_scale) + if next_val: + if not val: + val = 1 + val = val * next_val + number_words.append(tokens[idx + 1]) + + # is this a negative number? 
+ if val and prev_word and prev_word in _NEGATIVES: + val = 0 - val + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + current_val = val + + else: + if all([ + prev_word in _SUMS, + word not in _SUMS, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. + number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. `current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) + # + # We have a dict `string_num_scale` containing [value, word] + # pairs for "all" powers of ten: string_num_scale[10] == "ten. + # + # We need go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million".`val` is 9000000. + # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. + # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. 
+ # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions. + # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + + time_to_sum = True + for other_token in tokens[idx+1:]: + if other_token.word in multiplies: + if string_num_scale[other_token.word] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + val += sum(to_sum) + + return val, number_words + + +def _initialize_number_data(short_scale): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. 
+ + Args: + short_scale boolean: + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = _MULTIPLIES_SHORT_SCALE_CS if short_scale \ + else _MULTIPLIES_LONG_SCALE_CS + + string_num_ordinal_cs = _STRING_SHORT_ORDINAL_CS if short_scale \ + else _STRING_LONG_ORDINAL_CS + + string_num_scale_cs = _SHORT_SCALE_CS if short_scale else _LONG_SCALE_CS + string_num_scale_cs = invert_dict(string_num_scale_cs) + string_num_scale_cs.update(generate_plurals_cs(string_num_scale_cs)) + return multiplies, string_num_ordinal_cs, string_num_scale_cs + + +def extract_number_cs(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + return _extract_number_with_text_cs(tokenize(text.lower()), + short_scale, ordinals).value + + +def extract_duration_cs(text): + """ + Convert an english phrase into a number of seconds + + Convert things like: + "10 minute" + "2 and a half hours" + "3 days 8 hours 10 minutes and 49 seconds" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. 
+ """ + if not text: + return None + + # Czech inflection for time: minuta,minuty,minut - safe to use minut as pattern + # For day: den, dny, dnů - short patern not applicable, list all + + time_units = { + 'microseconds': 0, + 'milliseconds': 0, + 'seconds': 0, + 'minutes': 0, + 'hours': 0, + 'days': 0, + 'weeks': 0 + } + + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ay]?" + text = _convert_words_to_numbers_cs(text) + + for (unit_cs, unit_en) in _TIME_UNITS_CONVERSION.items(): + unit_pattern = pattern.format(unit=unit_cs) + + def repl(match): + time_units[unit_en] += float(match.group(1)) + return '' + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_cs(text, anchorDate=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "today" + "tomorrow afternoon" + "next Tuesday at 4pm" + "August 3rd" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "what is Tuesday's weather forecast" + returns the date for the forthcoming Tuesday relative to the reference + date and the remainder string + "what is weather forecast". + + The "next" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. + + Args: + text (str): string containing date words + anchorDate (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. 
+ """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + # Normalize czech inflection + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("dvoje", "2").replace("dvojice", "2") \ + .replace("dnes večer", "večer").replace("dnes v noci", "noci") # \ + # .replace("tento večer", "večer") + # .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ + # .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ + # .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ + # .replace("decades", "decade") \ + # .replace("tisíciletí", "milénium") + # .replace("oclock", "o'clock") + wordList = s.split() + + for idx, word in enumerate(wordList): + #word = word.replace("'s", "") + ########## + # Czech Day Ordinals - we do not use 1st,2nd format + # instead we use full ordinal number names with specific format(suffix) + # Example: třicátého prvního > 31 + count_ordinals = 0 + if word == "prvního": + count_ordinals = 1 # These two have different format + elif word == "třetího": + count_ordinals = 3 + elif word.endswith("ého"): + tmp = word[:-3] + tmp += ("ý") + for nr, name in _ORDINAL_BASE_CS.items(): + if name == tmp: + count_ordinals = nr + + # If number is bigger than 19 chceck if next word is also ordinal + # and count them together + if count_ordinals > 19: + if wordList[idx+1] == "prvního": + count_ordinals += 1 # These two have different format + elif wordList[idx+1] == "třetího": + count_ordinals += 3 + elif wordList[idx+1].endswith("ého"): + tmp = wordList[idx+1][:-3] + tmp += ("ý") + for nr, name in _ORDINAL_BASE_CS.items(): + if name == tmp and nr < 10: + # write only if sum makes acceptable count of days in month + if (count_ordinals + nr) <= 31: + count_ordinals += nr + + if count_ordinals > 0: + word = str(count_ordinals) # Write normalized valu into word + if count_ordinals > 20: + # If counted number is grather than 20, clear next word so it is not used again + 
wordList[idx+1] = "" + ########## + # Remove inflection from czech months + + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['ráno', 'dopoledne'] + timeQualifiersPM = ['odpoledne', 'večer', 'noc', 'noci'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['na', 'v', 'do', 'na', 'tento', + 'okolo', 'toto', 'během', 'za', 'této'] + days = ['pondělí', 'úterý', 'středa', + 'čtvrtek', 'pátek', 'sobota', 'neděle'] + months = _MONTHS_CZECH + recur_markers = days + [d + 'ho' for d in days] + \ + ['víkend', 'všední'] # Check this + monthsShort = ['led', 'úno', 'bře', 'dub', 'kvě', 'čvn', 'čvc', 'srp', + 'zář', 'říj', 'lis', 'pro'] + year_multiples = ["desetiletí", "století", "tisíciletí"] + day_multiples = ["týden", "měsíc", "rok"] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + + word = _text_cs_inflection_normalize(word, 2) + wordPrevPrev = _text_cs_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + wordPrev = _text_cs_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + wordNext = _text_cs_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + wordNextNext = _text_cs_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + #word = word.rstrip('s') + start = idx + used = 0 + # save timequalifier for later + # if word == 
"před" and dayOffset: + # dayOffset = - dayOffset + # used += 1 + if word == "nyní" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_cs(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if wordNext == "desetiletí": + yearOffset = multiplier * 10 + elif wordNext == "století": + yearOffset = multiplier * 100 + elif wordNext == "tisíciletí": + yearOffset = multiplier * 1000 + # couple of + elif word == "2" and wordNext == "krát" and \ + wordNextNext in year_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "desetiletí": + yearOffset = multiplier * 10 + elif wordNextNext == "století": + yearOffset = multiplier * 100 + elif wordNextNext == "tisíciletí": + yearOffset = multiplier * 1000 + elif word == "2" and wordNext == "krát" and \ + wordNextNext in day_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "rok": + yearOffset = multiplier + elif wordNextNext == "měsíc": + monthOffset = multiplier + elif wordNextNext == "týden": + dayOffset = multiplier * 7 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "dnes" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "zítra" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "den" and wordNext == "před" and wordNextNext == "včera" and not fromFlag: + dayOffset = -2 + used += 3 + elif word == "před" and wordNext == "včera" and not fromFlag: + dayOffset = -2 + used += 2 + elif word == "včera" and not fromFlag: + dayOffset = -1 + used += 1 + elif (word == "den" and + wordNext == "po" and + wordNextNext == "zítra" and + not fromFlag and + (not wordPrev or not wordPrev[0].isdigit())): + dayOffset = 2 + used = 3 + if wordPrev == "ten": + start -= 1 + 
used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "den": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + if wordPrevPrev == "před": + dayOffset = -dayOffset + used += 1 + start -= 1 + + elif word == "týden" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "další" or wordPrev == "příští": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "poslední": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "měsíc" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "další" or wordPrev == "příští": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poslední": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "rok" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "další" or wordPrev == "příští": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poslední": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "další" or wordPrev == "příští": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "poslední": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + # Convert czech months to english + datestr = _MONTHS_CONVERSION.get(m) + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == " " and wordPrevPrev[0].isdigit())): + if wordPrev == " " and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # if no date indicators found, it may not be the month of May + # may "i/we" ... + # "... 
may be" + # elif word == 'may' and wordNext in ['i', 'we', 'be']: + # datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("dnes") + validFollowups.append("zítra") + validFollowups.append("včera") + validFollowups.append("další") + validFollowups.append("příští") + validFollowups.append("poslední") + validFollowups.append("teď") + validFollowups.append("toto") + validFollowups.append("této") + validFollowups.append("tento") + if (word == "od" or word == "po" or word == "do") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "zítra": + dayOffset += 1 + elif wordNext == "včera": + dayOffset -= 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "další" or wordPrev == "příští": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "poslední": + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and (words[start - 1] == "toto" or words[start - 1] == "této" or words[start - 1] == "tento"): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + word = _text_cs_inflection_normalize(word, 2) + wordPrevPrev = _text_cs_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + wordPrev = _text_cs_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + wordNext = 
_text_cs_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + wordNextNext = _text_cs_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "poledne": + hrAbs = 12 + used += 1 + elif word == "půlnoc": + hrAbs = 0 + used += 1 + elif word == "ráno": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "odpoledne": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "večer": + if hrAbs is None: + hrAbs = 19 + used += 1 + if (wordNext != "" and wordNext[0].isdigit() and ":" in wordNext): + used -= 1 + elif word == "noci" or word == "noc": + if hrAbs is None: + hrAbs = 22 + #used += 1 + # if ((wordNext !='' and not wordNext[0].isdigit()) or wordNext =='') and \ + # ((wordNextNext !='' and not wordNextNext[0].isdigit())or wordNextNext =='') : + # used += 1 + # used += 1 ## NOTE this breaks other tests, TODO refactor me! + + # couple of time_unit + elif word == "2" and wordNext == "krát" and \ + wordNextNext in ["hodin", "minut", "sekund"]: + used += 3 + if wordNextNext == "hodin": + hrOffset = 2 + elif wordNextNext == "minut": + minOffset = 2 + elif wordNextNext == "sekund": + secOffset = 2 + # parse half an hour, quarter hour + elif word == "hodin" and \ + (wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "půl": + minOffset = 30 + elif wordPrev == "čtvrt": + minOffset = 15 + elif wordPrevPrev == "třičtvrtě": + minOffset = 15 + if idx > 2 and words[idx - 3] in markers: + words[idx - 3] = "" + words[idx - 2] = "" + elif wordPrev == "během": + hrOffset = 1 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "tato" or wordPrevPrev == "této": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "minut" and wordPrev == "za": + minOffset = 1 + words[idx - 1] = "" + used += 1 + # 
parse in a second + elif word == "sekund" and wordPrev == "za": + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + if wordNext == "večer" or wordNext == "noci" or wordNextNext == "večer" \ + or wordNextNext == "noci" or wordPrev == "večer" \ + or wordPrev == "noci" or wordPrevPrev == "večer" \ + or wordPrevPrev == "noci" or wordNextNextNext == "večer" \ + or wordNextNextNext == "noci": + remainder = "pm" + used += 1 + if wordPrev == "večer" or wordPrev == "noci": + words[idx - 1] = "" + if wordPrevPrev == "večer" or wordPrevPrev == "noci": + words[idx - 2] = "" + if wordNextNext == "večer" or wordNextNext == "noci": + used += 1 + if wordNextNextNext == "večer" or wordNextNextNext == "noci": + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + + # elif wordNext == "in" and wordNextNext == "the" and \ + # words[idx + 3] == "ráno": + # remainder = "am" + # used += 3 + # elif wordNext == "in" and wordNextNext == "the" and \ + # words[idx + 3] == "odpoledne": + # remainder = "pm" + # used += 3 + # elif wordNext == "in" and wordNextNext == "the" and \ + # words[idx + 3] == "večer": + # remainder = "pm" + # used += 3 + elif wordNext == "ráno": + remainder = "am" + used += 2 + elif wordNext == "odpoledne": + remainder = "pm" + used += 2 + elif wordNext == "večer": + remainder = "pm" + used += 2 + elif wordNext == "toto" and 
wordNextNext == "ráno": + remainder = "am" + used = 2 + daySpecified = True + elif wordNext == "na" and wordNextNext == "odpoledne": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "na" and wordNextNext == "večer": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "v" and wordNextNext == "noci": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if (int(strNum) > 100): # and #Check this + # ( + # wordPrev == "o" or + # wordPrev == "oh" + # )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hodin": + used += 1 + elif ( + (wordNext == "hodin" or + remainder == "hodin") and + word[0] != '0' and + # (wordPrev != "v" and wordPrev != "na") + wordPrev == "za" + and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "minut" or \ + remainder == "minut": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "sekund" \ + or remainder == "sekund": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. "3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hodin" or \ + remainder == "hodin": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. 
"04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext == "hodin" or + remainder == "hodin"): + used += 1 + elif ( + wordNext == "" or wordNext == "hodin" or + ( + (wordNext == "v" or wordNext == "na") and + ( + wordNextNext == timeQualifier + ) + ) or wordNext == 'večer' or + wordNextNext == 'večer'): + + strHH = strNum + strMM = "00" + if wordNext == "hodin": + used += 1 + if (wordNext == "v" or wordNext == "na" + or wordNextNext == "v" or wordNextNext == "na"): + used += (1 if (wordNext == + "v" or wordNext == "na") else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + elif remainder == "hodin": + remainder = "" + + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + if (not military and + remainder not in ['am', 'pm', 'hodin', 'minut', 'sekund'] and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if anchorDate.hour < HH or (anchorDate.hour == HH and + anchorDate.minute < MM): + pass # No modification needed + elif anchorDate.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + if timeQualifier in 
timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + # if wordPrev == "o" or wordPrev == "oh": + # words[words.index(wordPrev)] = "" + + if wordPrev == "brzy": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "pozdě": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "toto" or wordPrev == "této": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "toto" or wordPrev == "této": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + if datestr != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if 
minOffset != 0:
+        extractedDate = extractedDate + relativedelta(minutes=minOffset)
+    if secOffset != 0:
+        extractedDate = extractedDate + relativedelta(seconds=secOffset)
+    for idx, word in enumerate(words):
+        if words[idx] == "a" and \
+                words[idx - 1] == "" and words[idx + 1] == "":
+            words[idx] = ""
+
+    resultStr = " ".join(words)
+    resultStr = ' '.join(resultStr.split())
+    return [extractedDate, resultStr]
+
+
+def isFractional_cs(input_str, short_scale=True):
+    """
+    This function takes the given text and checks if it is a fraction.
+
+    Args:
+        input_str (str): the string to check if fractional
+        short_scale (bool): unused here, kept for API compatibility
+    Returns:
+        (bool) or (float): False if not a fraction, otherwise the fraction
+
+    """
+    # Lower-case first so the plural check below and the lookup agree
+    # (previously "TŘETINY" skipped the plural normalisation).
+    word = input_str.lower()
+    if word.endswith('iny'):
+        # Numerator bigger than one ("dvě třetiny"): normalise the plural
+        # to the singular form (třetiny -> třetina).
+        word = word[:-1] + "a"
+
+    fracts = {"celá": 1}  # "whole"
+
+    # Denominators of 2 and above map from their Czech name to the number.
+    for num in _FRACTION_STRING_CS:
+        if num > 1:
+            fracts[_FRACTION_STRING_CS[num]] = num
+
+    if word in fracts:
+        return 1.0 / fracts[word]
+    return False
+
+
+def extract_numbers_cs(text, short_scale=True, ordinals=False):
+    """
+        Takes in a string and extracts a list of numbers.
+
+    Args:
+        text (str): the string to extract a number from
+        short_scale (bool): Use "short scale" or "long scale" for large
+            numbers -- over a million.  The default is short scale, which
+            is now common in most English speaking countries.
+            See https://en.wikipedia.org/wiki/Names_of_large_numbers
+        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+    Returns:
+        list: list of extracted numbers as floats
+    """
+    results = _extract_numbers_with_text_cs(tokenize(text),
+                                            short_scale, ordinals)
+    return [float(result.value) for result in results]
+
+
+class CzechNormalizer(Normalizer):
+    # The normalisation rules are loaded once, when the class is defined.
+    with open(resolve_resource_file("text/cs-cz/normalize.json"), encoding='utf8') as f:
+        _default_config = json.load(f)
+
+
+def normalize_cs(text, remove_articles=True):
+    """ Czech string normalization """
+    return CzechNormalizer().normalize(text, remove_articles)
+
+
+# Inflected date/time words and the base form they are reduced to when
+# _text_cs_inflection_normalize is called with arg == 2.
+_CS_DATETIME_INFLECTIONS = {
+    "hodina": "hodin",
+    "hodiny": "hodin",
+    "hodinu": "hodin",
+    "minuta": "minut",
+    "minuty": "minut",
+    "minutu": "minut",
+    "sekunda": "sekund",
+    "sekundy": "sekund",
+    "sekundu": "sekund",
+    "dní": "den",
+    "dnů": "den",
+    "dny": "den",
+    "týdny": "týden",
+    "týdnů": "týden",
+    "měsíců": "měsíc",
+    "měsíce": "měsíc",
+    "měsíci": "měsíc",
+    "roky": "rok",
+    "roků": "rok",
+    "let": "rok",
+    "včerejšku": "včera",
+    "zítřku": "zítra",
+    "zítřejší": "zítra",
+    "ranní": "ráno",
+    "dopolední": "dopoledne",
+    "polední": "poledne",
+    "odpolední": "odpoledne",
+    "večerní": "večer",
+    "noční": "noc",
+    "víkendech": "víkend",
+    "víkendu": "víkend",
+    "všedních": "všední",
+    "všedním": "všední",
+    # Months whose inflected form is not simply "<stem>nu" / "<stem>na"
+    "únoru": "únor",
+    "červenci": "červenec",
+    "července": "červenec",
+    "listopadu": "listopad",
+    "prosinci": "prosinec",
+}
+
+
+def _text_cs_inflection_normalize(word, arg):
+    """
+    Czech Inflection normalizer.
+
+    This tries to normalize known inflections. The function is called
+    from multiple places; arg selects which set of rules applies.
+
+    Args:
+        word [Word]
+        arg [Int]: 1 for _extract_whole_number_with_text_cs,
+                   2 for extract_datetime_cs
+
+    Returns:
+        word [Word]: the normalized word, or the input unchanged when no
+                     rule applies
+
+    """
+    if arg == 1:  # _extract_whole_number_with_text_cs
+        # Number one (jedna): jeden / jedno / jedny
+        if len(word) == 5 and word.startswith("jed"):
+            suffix = 'en', 'no', 'ny'
+            if word.endswith(suffix, 3):
+                word = "jedna"
+
+        # Number two (dva)
+        elif word == "dvě":
+            word = "dva"
+
+    elif arg == 2:  # extract_datetime_cs
+        if word in _CS_DATETIME_INFLECTIONS:
+            word = _CS_DATETIME_INFLECTIONS[word]
+        elif word.endswith("nu") or word.endswith("na"):
+            # Regular month inflection: "ledna" / "lednu" -> "leden"
+            tmp = word[:-2] + "en"
+            if tmp in _MONTHS_CZECH:
+                word = tmp
+
+    return word
diff --git a/lingua_franca/lang/parse_da.py b/lingua_franca/lang/parse_da.py
new file mode 100644
index 0000000..14b1813
--- /dev/null
+++ b/lingua_franca/lang/parse_da.py
@@ -0,0 +1,891 @@
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from datetime import datetime
+from dateutil.relativedelta import relativedelta
+from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
+    extract_numbers_generic, Normalizer
+from lingua_franca.lang.common_data_da import _DA_NUMBERS
+from lingua_franca.lang.format_da import pronounce_number_da
+from lingua_franca.time import now_local
+
+
+def extract_number_da(text, short_scale=True, ordinals=False):
+    """
+    Extract the first number found in a Danish text.
+
+    Understands digits (including decimals), number words listed in
+    _DA_NUMBERS, fraction words, ordinal words, "a/b" fractions and
+    "<x> og <y>" compounds.
+
+    Args:
+        text (str): the string to extract a number from
+        short_scale (bool): unused, present for API compatibility
+        ordinals (bool): unused, present for API compatibility
+    Returns:
+        (int) or (float) or False: the value of the extracted number, or
+        False when no number (or only zero) was found
+
+    """
+    text = text.lower()
+    aWords = [word for word in text.split() if
+              word not in ["den", "det"]]
+    and_pass = False
+    valPreAnd = False
+    val = False
+    count = 0
+    while count < len(aWords):
+        word = aWords[count]
+        if is_numeric(word):
+            # float() handles integers and decimals alike; the digit test
+            # keeps float-parsable non-numbers such as "nan" out.
+            if any(ch.isdigit() for ch in word):
+                val = float(word)
+        elif is_fractional_da(word):
+            val = is_fractional_da(word)
+        elif is_ordinal_da(word):
+            val = is_ordinal_da(word)
+        else:
+            if word in _DA_NUMBERS:
+                val = _DA_NUMBERS[word]
+                if count < (len(aWords) - 1):
+                    wordNext = aWords[count + 1]
+                else:
+                    wordNext = ""
+                valNext = is_fractional_da(wordNext)
+
+                if valNext:
+                    # "<number> <fraction>", e.g. "to tredjedel"
+                    val = val * valNext
+                    aWords[count + 1] = ""
+
+        if not val:
+            # look for fractions like "2/3"
+            aPieces = word.split('/')
+            if look_for_fractions(aPieces):
+                val = float(aPieces[0]) / float(aPieces[1])
+            elif and_pass:
+                # nothing follows the "og": keep the value before it
+                val = valPreAnd
+                break
+            else:
+                count += 1
+                continue
+
+        aWords[count] = ""
+
+        if and_pass:
+            aWords[count - 1] = ''  # remove "og"
+            val += valPreAnd
+        elif count + 1 < len(aWords) and aWords[count + 1] == 'og':
+            and_pass = True
+            valPreAnd = val
+            val = False
+            count += 2
+            continue
+        elif count + 2 < len(aWords) and aWords[count + 2] == 'og':
+            and_pass = True
+            valPreAnd = val
+            val = False
+            count += 3
+            continue
+
+        break
+
+    return val or False
+
+
+def extract_datetime_da(text, anchorDate=None, default_time=None):
+    """
+    Extract a date and time from a Danish sentence.
+
+    Args:
+        text (str): the sentence to parse
+        anchorDate (datetime): the reference "now"; now_local() when omitted
+        default_time: object with .hour and .minute, used as the time of
+            day when the text contains no time
+    Returns:
+        [datetime, str] or None: the extracted datetime and the text that
+        is left after removing the consumed words; None when the text is
+        empty or contains no date/time information
+    """
+
+    def clean_string(s):
+        """
+        Lower-case the input, strip punctuation and filler words and turn
+        ordinal words into digits. Returns the remaining words as a list.
+        """
+        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
+            .replace(' den ', ' ').replace(' det ', ' ') \
+            .replace(' om ', ' ').replace(' om ', ' ') \
+            .replace(' på ', ' ').replace(' om ', ' ')
+        wordList = s.split()
+
+        for idx, word in enumerate(wordList):
+            ordinal = is_ordinal_da(word)
+            if ordinal is not False:
+                wordList[idx] = str(ordinal)
+
+        return wordList
+
+    def date_found():
+        return found or \
+            (
+                datestr != "" or
+                yearOffset != 0 or monthOffset != 0 or
+                dayOffset is not False or hrOffset != 0 or
+                hrAbs or minOffset != 0 or
+                minAbs or secOffset != 0
+            )
+
+    def is_count(token):
+        """True when int(token) is safe (token consists of digits only)."""
+        return token.isdecimal()
+
+    def day_part_meridiem(part, hour):
+        """Return 'am'/'pm' for a day-part word, '' if it implies neither."""
+        if part.startswith(("eftermiddag", "aften")):
+            return "pm"
+        if part.startswith("morgen"):
+            return "am"
+        if part in ("nat", "natten"):
+            # 8-12 at night is the evening, anything else the small hours
+            return "pm" if 8 <= hour <= 12 else "am"
+        return ""
+
+    if text == "":
+        return None
+
+    anchorDate = anchorDate or now_local()
+    found = False
+    daySpecified = False
+    dayOffset = False
+    monthOffset = 0
+    yearOffset = 0
+    dateNow = anchorDate
+    today = dateNow.strftime("%w")
+    currentYear = dateNow.strftime("%Y")
+    fromFlag = False
+    datestr = ""
+    hasYear = False
+    timeQualifier = ""
+
+    timeQualifiersList = ['tidlig',
+                          'morgen',
+                          'morgenen',
+                          'formidag',
+                          'formiddag',
+                          'formiddagen',
+                          'eftermiddag',
+                          'eftermiddagen',
+                          'aften',
+                          'aftenen',
+                          'nat',
+                          'natten']
+    markers = ['i', 'om', 'på', 'klokken', 'ved']
+    days = ['mandag', 'tirsdag', 'onsdag',
+            'torsdag', 'fredag', 'lørdag', 'søndag']
+    months = ['januar', 'februar', 'marts', 'april', 'maj', 'juni',
+              'juli', 'august', 'september', 'oktober', 'november',
+              'december']
+    monthsShort = ['jan', 'feb', 'mar', 'apr', 'maj', 'juni', 'juli', 'aug',
+                   'sep', 'okt', 'nov', 'dec']
+    # "forige" is a misspelling kept so existing input keeps working
+    lastWords = ("forrige", "forige")
+    hourWords = ("time", "timer")
+    minuteWords = ("minut", "minutter")
+    secondWords = ("sekund", "sekunder")
+
+    validFollowups = days + months + monthsShort
+    validFollowups.append("i dag")
+    validFollowups.append("morgen")
+    validFollowups.append("næste")
+    validFollowups.extend(lastWords)
+    validFollowups.append("nu")
+
+    words = clean_string(text)
+
+    for idx, word in enumerate(words):
+        if word == "":
+            continue
+        wordPrevPrev = words[idx - 2] if idx > 1 else ""
+        wordPrev = words[idx - 1] if idx > 0 else ""
+        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
+        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
+
+        start = idx
+        used = 0
+        # "morgen" means tomorrow unless it follows "om" or a weekday
+        # (a weekday rewrites a following "morgen" to "tidlig" below).
+        # This must be tested before the time-qualifier check, because
+        # "morgen" is also in timeQualifiersList.
+        if word == "morgen" and not fromFlag and wordPrev != "om" and \
+                wordPrev not in days:
+            dayOffset = 1
+            used += 1
+        # save timequalifier for later
+        elif word in timeQualifiersList:
+            timeQualifier = word
+        # parse today, day after tomorrow
+        elif word == "dag" and not fromFlag:
+            dayOffset = 0
+            used += 1
+        elif word == "overmorgen" and not fromFlag:
+            dayOffset = 2
+            used += 1
+        # parse 5 days, 10 weeks, last week, next week
+        elif word == "dag" or word == "dage":
+            if is_count(wordPrev):
+                dayOffset += int(wordPrev)
+                start -= 1
+                used = 2
+        elif word == "uge" or word == "uger" and not fromFlag:
+            if is_count(wordPrev):
+                dayOffset += int(wordPrev) * 7
+                start -= 1
+                used = 2
+            elif wordPrev == "næste":
+                dayOffset = 7
+                start -= 1
+                used = 2
+            elif wordPrev in lastWords:
+                dayOffset = -7
+                start -= 1
+                used = 2
+        # parse 10 months, next month, last month
+        elif word == "måned" and not fromFlag:
+            if is_count(wordPrev):
+                monthOffset = int(wordPrev)
+                start -= 1
+                used = 2
+            elif wordPrev == "næste":
+                monthOffset = 1
+                start -= 1
+                used = 2
+            elif wordPrev in lastWords:
+                monthOffset = -1
+                start -= 1
+                used = 2
+        # parse 5 years, next year, last year
+        elif word == "år" and not fromFlag:
+            if is_count(wordPrev):
+                yearOffset = int(wordPrev)
+                start -= 1
+                used = 2
+            elif wordPrev == "næste":
+                yearOffset = 1
+                start -= 1
+                used = 2
+            elif wordPrev in lastWords:
+                yearOffset = -1
+                start -= 1
+                used = 2
+        # parse Monday, Tuesday, etc., and next Monday,
+        # last Tuesday, etc.
+        elif word in days and not fromFlag:
+            d = days.index(word)
+            dayOffset = (d + 1) - int(today)
+            used = 1
+            if dayOffset < 0:
+                dayOffset += 7
+            if wordNext == "morgen":
+                # morgen means morning if preceded by
+                # the day of the week
+                words[idx + 1] = "tidlig"
+            if wordPrev == "næste":
+                dayOffset += 7
+                used += 1
+                start -= 1
+            elif wordPrev in lastWords:
+                dayOffset -= 7
+                used += 1
+                start -= 1
+        # parse 15 of July, June 20th, Feb 18, 19 of February
+        elif word in months or word in monthsShort and not fromFlag:
+            try:
+                m = months.index(word)
+            except ValueError:
+                m = monthsShort.index(word)
+            used += 1
+            # datestr always starts with the full Danish month name
+            datestr = months[m]
+            if wordPrev and (wordPrev[0].isdigit() or
+                             (wordPrev == "of" and
+                              wordPrevPrev[:1].isdigit())):
+                if wordPrev == "of" and wordPrevPrev[:1].isdigit():
+                    datestr += " " + words[idx - 2]
+                    used += 1
+                    start -= 1
+                else:
+                    datestr += " " + wordPrev
+                start -= 1
+                used += 1
+                if wordNext and wordNext[0].isdigit():
+                    datestr += " " + wordNext
+                    used += 1
+                    hasYear = True
+                else:
+                    hasYear = False
+
+            elif wordNext and wordNext[0].isdigit():
+                datestr += " " + wordNext
+                used += 1
+                if wordNextNext and wordNextNext[0].isdigit():
+                    datestr += " " + wordNextNext
+                    used += 1
+                    hasYear = True
+                else:
+                    hasYear = False
+        # parse 5 days from tomorrow, 10 weeks from next thursday,
+        # 2 months from July
+
+        if (
+                word == "fra" or word == "til" or word == "om") and wordNext \
+                in validFollowups:
+            used = 2
+            fromFlag = True
+            if wordNext == "morgen" and \
+                    wordPrev != "om" and \
+                    wordPrev not in days:
+                # morgen means tomorrow if not "om morgen" and not
+                # [day of the week] morgen
+                dayOffset += 1
+            elif wordNext in days:
+                d = days.index(wordNext)
+                tmpOffset = (d + 1) - int(today)
+                used = 2
+                if tmpOffset < 0:
+                    tmpOffset += 7
+                dayOffset += tmpOffset
+            elif wordNextNext and wordNextNext in days:
+                d = days.index(wordNextNext)
+                tmpOffset = (d + 1) - int(today)
+                used = 3
+                if wordNext == "næste":
+                    tmpOffset += 7
+                    used += 1
+                    start -= 1
+                elif wordNext in lastWords:
+                    tmpOffset -= 7
+                    used += 1
+                    start -= 1
+                dayOffset += tmpOffset
+        if used > 0:
+            if start - 1 > 0 and words[start - 1].startswith("denne"):
+                start -= 1
+                used += 1
+
+            # clear the consumed words, never reaching outside the list
+            for pos in range(max(start, 0), min(start + used, len(words))):
+                words[pos] = ""
+
+            if start - 1 >= 0 and words[start - 1] in markers:
+                words[start - 1] = ""
+            found = True
+            daySpecified = True
+
+    # parse time
+    hrOffset = 0
+    minOffset = 0
+    secOffset = 0
+    hrAbs = None
+    minAbs = None
+
+    for idx, word in enumerate(words):
+        if word == "":
+            continue
+
+        wordPrevPrev = words[idx - 2] if idx > 1 else ""
+        wordPrev = words[idx - 1] if idx > 0 else ""
+        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
+        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
+        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
+
+        # parse noon, midnight, morning, afternoon, evening
+        used = 0
+        if word.startswith("middag"):
+            hrAbs = 12
+            used += 1
+        elif word.startswith("midnat"):
+            hrAbs = 0
+            used += 1
+        elif word == "morgenen" or word == "tidlig":
+            if hrAbs is None:
+                hrAbs = 8
+            used += 1
+        elif word.startswith("eftermiddag"):
+            if hrAbs is None:
+                hrAbs = 15
+            used += 1
+        elif word.startswith("aften"):
+            if hrAbs is None:
+                hrAbs = 19
+            used += 1
+        # parse half an hour, quarter hour
+        elif word == "time" and \
+                (wordPrev in markers or wordPrevPrev in markers):
+            if wordPrev[:4] == "halv":
+                minOffset = 30
+            elif wordPrev == "kvarter":
+                minOffset = 15
+            elif wordPrev == "trekvarter":
+                minOffset = 45
+            else:
+                hrOffset = 1
+            if wordPrevPrev in markers:
+                words[idx - 2] = ""
+            words[idx - 1] = ""
+            used += 1
+            # -1 marks "relative offset only, no absolute time"
+            hrAbs = -1
+            minAbs = -1
+        # parse 5:00 am, 12:00 p.m., etc
+        elif word[0].isdigit():
+            isTime = True
+            strHH = ""
+            strMM = ""
+            remainder = ""
+            if ':' in word:
+                # parse colons: "<HH>:<MM><suffix>"
+                pos = 0
+                while pos < len(word) and word[pos].isdigit():
+                    strHH += word[pos]
+                    pos += 1
+                if pos < len(word) and word[pos] == ":":
+                    pos += 1
+                    while pos < len(word) and word[pos].isdigit():
+                        strMM += word[pos]
+                        pos += 1
+                # whatever follows the digits, e.g. "pm" in "3:00pm"
+                remainder = word[pos:].replace(".", "")
+                if remainder == "":
+                    nextWord = wordNext.replace(".", "")
+                    if nextWord == "am" or nextWord == "pm":
+                        remainder = nextWord
+                        used += 1
+                    elif nextWord == "aften":
+                        remainder = "pm"
+                        used += 1
+                    elif wordNext == "om" and wordNextNext == "morgenen":
+                        remainder = "am"
+                        used += 2
+                    elif wordNext == "om" and \
+                            wordNextNext == "eftermiddagen":
+                        remainder = "pm"
+                        used += 2
+                    elif wordNext == "om" and wordNextNext == "aftenen":
+                        remainder = "pm"
+                        used += 2
+                    elif wordNext == "morgen":
+                        remainder = "am"
+                        used += 1
+                    elif wordNext == "eftermiddag":
+                        remainder = "pm"
+                        used += 1
+                    elif wordNext == "i" and wordNextNext == "morgen":
+                        remainder = "am"
+                        used = 2
+                    elif wordNext == "i" and wordNextNext == "eftermiddag":
+                        remainder = "pm"
+                        used = 2
+                    elif wordNext == "i" and wordNextNext == "aften":
+                        remainder = "pm"
+                        used = 2
+                    elif wordNext == "natten":
+                        if int(strHH) > 4:
+                            remainder = "pm"
+                        else:
+                            remainder = "am"
+                        used += 1
+                    else:
+                        # fall back on a day-part seen elsewhere in the text
+                        if timeQualifier.startswith(
+                                ("aften", "eftermiddag")) and \
+                                int(strHH) < 12:
+                            strHH = str(int(strHH) + 12)
+            else:
+                # try to parse numbers without colons
+                # 5 hours, 10 minutes etc.
+                strNum = "".join(ch for ch in word if ch.isdigit())
+                suffix = "".join(ch for ch in word if not ch.isdigit())
+                remainder = suffix
+
+                if remainder == "":
+                    remainder = wordNext.replace(".", "").lstrip().rstrip()
+
+                if (
+                        remainder == "pm" or
+                        wordNext == "pm" or
+                        remainder == "p.m." or
+                        wordNext == "p.m."):
+                    strHH = strNum
+                    remainder = "pm"
+                    # consume the next word only if the marker came from it
+                    used = 0 if suffix in ("pm", "p.m.") else 1
+                elif (
+                        remainder == "am" or
+                        wordNext == "am" or
+                        remainder == "a.m." or
+                        wordNext == "a.m."):
+                    strHH = strNum
+                    remainder = "am"
+                    used = 0 if suffix in ("am", "a.m.") else 1
+                else:
+                    if wordNext in hourWords and int(strNum) < 100:
+                        # "in 3 hours"
+                        hrOffset = int(strNum)
+                        used = 2
+                        isTime = False
+                        hrAbs = -1
+                        minAbs = -1
+                    elif wordNext in minuteWords:
+                        # "in 10 minutes"
+                        minOffset = int(strNum)
+                        used = 2
+                        isTime = False
+                        hrAbs = -1
+                        minAbs = -1
+                    elif wordNext in secondWords:
+                        # in 5 seconds
+                        secOffset = int(strNum)
+                        used = 2
+                        isTime = False
+                        hrAbs = -1
+                        minAbs = -1
+
+                    elif wordNext in hourWords:
+                        strHH = strNum
+                        used += 1
+                        isTime = True
+                        if wordNextNext == timeQualifier:
+                            strMM = ""
+                            meridiem = day_part_meridiem(wordNextNext,
+                                                         int(strNum))
+                            if meridiem:
+                                remainder = meridiem
+                                used += 1
+
+                        elif wordNextNext.isdecimal():
+                            strMM = wordNextNext
+                            used += 1
+                            if wordNextNextNext == timeQualifier:
+                                meridiem = day_part_meridiem(
+                                    wordNextNextNext, int(strNum))
+                                if meridiem:
+                                    remainder = meridiem
+                                    used += 1
+
+                    elif wordNext == timeQualifier:
+                        strHH = strNum
+                        strMM = 0
+                        isTime = True
+                        meridiem = day_part_meridiem(wordNext, int(strNum))
+                        if meridiem:
+                            remainder = meridiem
+                            used += 1
+
+            HH = int(strHH) if strHH else 0
+            MM = int(strMM) if strMM else 0
+            if remainder == "pm" and HH < 12:
+                HH += 12
+            if remainder == "am" and HH >= 12:
+                HH -= 12
+            if HH > 24 or MM > 59:
+                isTime = False
+                used = 0
+            if isTime:
+                hrAbs = HH
+                minAbs = MM
+                used += 1
+        if used > 0:
+            # removed parsed words from the sentence
+            for i in range(used):
+                if idx + i >= len(words):
+                    break
+                words[idx + i] = ""
+
+            if wordPrev == "tidlig":
+                hrOffset = -1
+                words[idx - 1] = ""
+            elif wordPrev == "sen":
+                hrOffset = 1
+                words[idx - 1] = ""
+            if idx > 0 and wordPrev in markers:
+                words[idx - 1] = ""
+            if idx > 1 and wordPrevPrev in markers:
+                words[idx - 2] = ""
+
+            found = True
+
+    # check that we found a date
+    if not date_found():
+        return None
+
+    if dayOffset is False:
+        dayOffset = 0
+
+    # perform date manipulation
+
+    extractedDate = dateNow.replace(microsecond=0,
+                                    second=0,
+                                    minute=0,
+                                    hour=0)
+    if datestr != "":
+        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
+                     'july', 'august', 'september', 'october', 'november',
+                     'december']
+        # strptime("%B") expects the English name. Only the leading month
+        # word is swapped, so no other part of the string is rewritten.
+        da_month = datestr.split(" ", 1)[0]
+        datestr = en_months[months.index(da_month)] + \
+            datestr[len(da_month):]
+
+        try:
+            temp = datetime.strptime(datestr, "%B %d")
+        except ValueError:
+            # Try again, allowing the year
+            temp = datetime.strptime(datestr, "%B %d %Y")
+        if extractedDate.tzinfo:
+            temp = temp.replace(tzinfo=extractedDate.tzinfo)
+
+        if not hasYear:
+            temp = temp.replace(year=extractedDate.year)
+            if extractedDate < temp:
+                extractedDate = extractedDate.replace(
+                    year=int(currentYear),
+                    month=temp.month,
+                    day=temp.day)
+            else:
+                # the date has already passed this year: use next year
+                extractedDate = extractedDate.replace(
+                    year=int(currentYear) + 1,
+                    month=temp.month,
+                    day=temp.day)
+        else:
+            extractedDate = extractedDate.replace(
+                year=temp.year,
+                month=temp.month,
+                day=temp.day)
+
+    if yearOffset != 0:
+        extractedDate = extractedDate + relativedelta(years=yearOffset)
+    if monthOffset != 0:
+        extractedDate = extractedDate + relativedelta(months=monthOffset)
+    if dayOffset != 0:
+        extractedDate = extractedDate + relativedelta(days=dayOffset)
+
+    if hrAbs is None and minAbs is None and default_time:
+        hrAbs = default_time.hour
+        minAbs = default_time.minute
+
+    if hrAbs != -1 and minAbs != -1:
+
+        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
+                                                      minutes=minAbs or 0)
+        if (hrAbs or minAbs) and datestr == "":
+            if not daySpecified and dateNow > extractedDate:
+                # the time has already passed today: assume tomorrow
+                extractedDate = extractedDate + relativedelta(days=1)
+    if hrOffset != 0:
+        extractedDate = extractedDate + relativedelta(hours=hrOffset)
+    if minOffset != 0:
+        extractedDate = extractedDate + relativedelta(minutes=minOffset)
+    if secOffset != 0:
+        extractedDate = extractedDate + relativedelta(seconds=secOffset)
+    # drop an "og" that is left between two consumed words
+    for idx, word in enumerate(words):
+        if word == "og" and 0 < idx < len(words) - 1 \
+                and words[idx - 1] == "" and words[idx + 1] == "":
+            words[idx] = ""
+
+    resultStr = " ".join(words)
+    resultStr = ' '.join(resultStr.split())
+
+    return [extractedDate, resultStr]
+
+
+def is_fractional_da(input_str, short_scale=True):
+    """
+    This function takes the given text and checks if it is a fraction.
+
+    Args:
+        input_str (str): the string to check if fractional
+        short_scale (bool): unused, present for API compatibility
+    Returns:
+        (bool) or (float): False if not a fraction, otherwise the fraction
+
+    """
+    word = input_str.lower()
+
+    # Whole-number words are never fractions, even when they begin with
+    # "halv" (e.g. "halvtreds" is 50 in Danish).
+    if word in _DA_NUMBERS and _DA_NUMBERS[word] >= 1:
+        return False
+
+    if word.startswith("halv"):
+        return 0.5
+
+    if word in ("tredjedel", "trediedel"):
+        return 1.0 / 3
+
+    if word.endswith('del'):
+        stem = word[:-3]  # e.g. "fjerdedel" -> "fjerde"
+        denominator = _DA_NUMBERS.get(stem) or is_ordinal_da(stem)
+        if denominator:
+            return 1.0 / denominator
+
+    return False
+
+
+def is_ordinal_da(input_str):
+    """
+    This function takes the given text and checks if it is an ordinal number.
+
+    Args:
+        input_str (str): the string to check if ordinal
+    Returns:
+        (bool) or (int): False if not an ordinal, otherwise the number
+        corresponding to the ordinal
+
+    Regular ordinals are only recognised when the underlying number word
+    is a key of _DA_NUMBERS.
+
+    """
+
+    lowerstr = input_str.lower()
+
+    # Regular ordinals: number word + suffix. Every suffix is tried on the
+    # unmodified word; "tende" covers 13th-19th ("femten" -> "femtende"),
+    # where only the final "de" is the ordinal ending.
+    for suffix, cut in (("ende", 4), ("nde", 3), ("tende", 2), ("te", 2)):
+        if lowerstr.endswith(suffix):
+            stem = lowerstr[:-cut]
+            if stem in _DA_NUMBERS:
+                return _DA_NUMBERS[stem]
+
+    # Irregular ordinals (matched by prefix, as before)
+    if lowerstr.startswith("første"):
+        return 1
+    if lowerstr.startswith("anden"):
+        return 2
+    if lowerstr.startswith(("tredje", "tredie")):
+        return 3
+    if lowerstr.startswith("fjerde"):
+        return 4
+    if lowerstr.startswith("femte"):
+        return 5
+    if lowerstr.startswith("sjette"):
+        return 6
+    if lowerstr.startswith(("ellevte", "elfte")):
+        return 11
+    if lowerstr.startswith(("tolvte", "tolvfte")):
+        return 12
+
+    return False
+
+
+def normalize_da(text, remove_articles=True):
+    """ Danish string normalization """
+
+    normalized = []
+    for word in text.split():  # split() also removes extra spaces
+        if remove_articles and word in ["den", "det"]:
+            continue
+
+        # Convert numbers into digits, e.g. "to" -> "2"
+        if word in _DA_NUMBERS:
+            word = str(_DA_NUMBERS[word])
+
+        normalized.append(word)
+
+    return " ".join(normalized)
+
+
+def extract_numbers_da(text, short_scale=True, ordinals=False):
+    """
+        Takes in a string and extracts a list of numbers.
+
+    Args:
+        text (str): the string to extract a number from
+        short_scale (bool): Use "short scale" or "long scale" for large
+            numbers -- over a million.  The default is short scale, which
+            is now common in most English speaking countries.
+            See https://en.wikipedia.org/wiki/Names_of_large_numbers
+        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+    Returns:
+        list: list of extracted numbers as floats
+    """
+    return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
+                                   short_scale=short_scale, ordinals=ordinals)
+
+
+class DanishNormalizer(Normalizer):
+    """ TODO implement language specific normalizer"""
diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py
new file mode 100644
index 0000000..95fda48
--- /dev/null
+++ b/lingua_franca/lang/parse_de.py
@@ -0,0 +1,1025 @@
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
#
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
    extract_numbers_generic, Normalizer
from lingua_franca.lang.common_data_de import _DE_NUMBERS
from lingua_franca.lang.format_de import pronounce_number_de
from lingua_franca.time import now_local


# Module-local table of German number words.  The functions below look numbers
# up in _DE_NUMBERS (imported above), not in this dict.
de_numbers = {
    'null': 0,
    'ein': 1,
    'eins': 1,
    'eine': 1,
    'einer': 1,
    'einem': 1,
    'einen': 1,
    'eines': 1,
    'zwei': 2,
    'drei': 3,
    'vier': 4,
    'fünf': 5,
    'sechs': 6,
    'sieben': 7,
    'acht': 8,
    'neun': 9,
    'zehn': 10,
    'elf': 11,
    'zwölf': 12,
    'dreizehn': 13,
    'vierzehn': 14,
    'fünfzehn': 15,
    'sechzehn': 16,
    'siebzehn': 17,
    'achtzehn': 18,
    'neunzehn': 19,
    'zwanzig': 20,
    'einundzwanzig': 21,
    'zweiundzwanzig': 22,
    'dreiundzwanzig': 23,
    'vierundzwanzig': 24,
    'fünfundzwanzig': 25,
    'sechsundzwanzig': 26,
    'siebenundzwanzig': 27,
    'achtundzwanzig': 28,
    'neunundzwanzig': 29,
    'dreißig': 30,
    'einunddreißig': 31,
    'vierzig': 40,
    'fünfzig': 50,
    'sechzig': 60,
    'siebzig': 70,
    'achtzig': 80,
    'neunzig': 90,
    'hundert': 100,
    'zweihundert': 200,
    'dreihundert': 300,
    'vierhundert': 400,
    'fünfhundert': 500,
    'sechshundert': 600,
    'siebenhundert': 700,
    'achthundert': 800,
    'neunhundert': 900,
    'tausend': 1000,
    'million': 1000000
}


def extract_duration_de(text):
    """
    Convert a German phrase into a duration.

    Handles phrases such as:
        "10 Minuten"
        "3 Tage 8 Stunden 10 Minuten und 49 Sekunden"
    Only digit amounts are recognised (number words are not converted yet).
    The "<amount> <unit>" pairs are removed from the text and the rest is
    returned, e.g. "timer für 5 minuten" -> (timedelta(minutes=5),
    "timer für").

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            will have whitespace stripped from the ends.
        None: if ``text`` is empty.
    """
    if not text:
        return None

    text = text.lower()
    # Maps timedelta() keyword -> German plural.  Each value is overwritten
    # below with the amount parsed for that unit.
    time_units = {
        'microseconds': 'mikrosekunden',
        'milliseconds': 'millisekunden',
        'seconds': 'sekunden',
        'minutes': 'minuten',
        'hours': 'stunden',
        'days': 'tage',
        'weeks': 'wochen'
    }

    # Singular and plural.  The amount must be a *named* group: "(?P" without
    # "<name>" is not valid regex syntax and makes re.sub() raise re.error.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ne]?"

    # TODO entry point for text-to-number conversion
    # text = _convert_words_to_numbers_de(text)

    for (unit_en, unit_de) in time_units.items():
        unit_pattern = pattern.format(
            unit=unit_de[:-1])  # remove 'n'/'e' from unit
        time_units[unit_en] = 0

        def repl(match):
            time_units[unit_en] += float(match.group(1))
            return ''
        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)


def extract_number_de(text, short_scale=True, ordinals=False):
    """
    Extract the first number from a German phrase.

    Digits, number words from _DE_NUMBERS, ordinals, fractions ("drittel",
    "2/3") and "<number> und <number>" combinations are recognised.

    Undefined articles cannot be suppressed in German:
    'ein Pferd' means 'one horse' and 'a horse'.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): unused, kept for API compatibility
        ordinals (bool): unused, kept for API compatibility
    Returns:
        (int) or (float): the value of the extracted number, or False if
            nothing was found
    """
    # TODO: short_scale and ordinals don't do anything here.
    # The parameters are present in the function signature for API
    # compatibility reasons.
    text = text.lower()
    aWords = text.split()
    aWords = [word for word in aWords if
              word not in ["der", "die", "das", "des", "den", "dem"]]
    and_pass = False
    valPreAnd = False
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            # if word.isdigit():            # doesn't work with decimals
            val = float(word)
        elif is_fractional_de(word):
            val = is_fractional_de(word)
        elif is_ordinal_de(word):
            val = is_ordinal_de(word)
        else:
            if word in _DE_NUMBERS:
                val = _DE_NUMBERS[word]
                if count < (len(aWords) - 1):
                    wordNext = aWords[count + 1]
                else:
                    wordNext = ""
                valNext = is_fractional_de(wordNext)

                if valNext:
                    val = val * valNext
                    aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            # if (len(aPieces) == 2 and is_numeric(aPieces[0])
            #   and is_numeric(aPieces[1])):
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # added to value, quit here
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "and"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'und':
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'und':
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    return val or False


def extract_datetime_de(text, anchorDate=None, default_time=None):
    """
    Extract a date and/or time from a German phrase.

    Args:
        text (str): the phrase to parse
        anchorDate (datetime): reference point for relative expressions;
            now_local() is used when it is not given
        default_time: object with ``hour`` and ``minute`` attributes, applied
            when the phrase contains a date but no time of day
    Returns:
        [datetime, str]: the extracted datetime and the phrase with the
            consumed words removed, or None if ``text`` is empty or no
            date/time expression was found.
    """
    def clean_string(s):
        """
            cleans the input string of unneeded punctuation
            and capitalization among other things.

            'am' is a preposition, so cannot currently be used
            for 12 hour date format
        """

        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' der ', ' ').replace(' den ', ' ') \
            .replace(' an ', ' ').replace(' am ', ' ') \
            .replace(' auf ', ' ').replace(' um ', ' ')
        wordList = s.split()

        for idx, word in enumerate(wordList):
            if is_ordinal_de(word) is not False:
                word = str(is_ordinal_de(word))
                wordList[idx] = word

        return wordList

    def date_found():
        return found or \
            (
                datestr != "" or timeStr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is True or hrOffset != 0 or
                hrAbs or minOffset != 0 or
                minAbs or secOffset != 0
            )

    if text == "":
        return None

    anchorDate = anchorDate or now_local()
    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = anchorDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['früh', 'morgens', 'vormittag', 'vormittags',
                          'nachmittag', 'nachmittags', 'abend', 'abends',
                          'nachts']
    markers = ['in', 'am', 'gegen', 'bis', 'für']
    days = ['montag', 'dienstag', 'mittwoch',
            'donnerstag', 'freitag', 'samstag', 'sonntag']
    # German spelling is "oktober"/"okt"; the English spelling used to sit in
    # these tables, so the German month was never recognised.
    months = ['januar', 'februar', 'märz', 'april', 'mai', 'juni',
              'juli', 'august', 'september', 'oktober', 'november',
              'dezember']
    monthsShort = ['jan', 'feb', 'mär', 'apr', 'mai', 'juni', 'juli', 'aug',
                   'sept', 'okt', 'nov', 'dez']

    validFollowups = days + months + monthsShort
    validFollowups.append("heute")
    validFollowups.append("morgen")
    validFollowups.append("nächste")
    validFollowups.append("nächster")
    validFollowups.append("nächstes")
    validFollowups.append("nächsten")
    validFollowups.append("nächstem")
    validFollowups.append("letzte")
    validFollowups.append("letzter")
    validFollowups.append("letztes")
    validFollowups.append("letzten")
    validFollowups.append("letztem")
    validFollowups.append("jetzt")

    words = clean_string(text)

    # ---- first pass: dates ----
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # this isn't in clean string because I don't want to save back to words

        if word != 'morgen' and word != 'übermorgen':
            if word[-2:] == "en":
                word = word[:-2]  # remove en
        if word != 'heute':
            if word[-1:] == "e":
                word = word[:-1]  # remove plural for most nouns

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "heute" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "morgen" and not fromFlag and wordPrev != "am" and \
                wordPrev not in days:
            # morgen means tomorrow if not "am Morgen" and not
            # [day of the week] morgen
            dayOffset = 1
            used += 1
        elif word == "übermorgen" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 days, 10 weeks, last week, next week
        # ([:1] instead of [0]: wordPrev is "" for the first word and
        # "".isdigit() is False, whereas ""[0] raises IndexError)
        elif word == "tag" or word == "tage":
            if wordPrev[:1].isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "woch" and not fromFlag:
            if wordPrev[:1].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev[:6] == "nächst":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "monat" and not fromFlag:
            if wordPrev[:1].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev[:6] == "nächst":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "jahr" and not fromFlag:
            if wordPrev[:1].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev[:6] == "nächst":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                # this branch used to repeat the "nächst" test and was
                # therefore unreachable
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext == "morgen":
                # morgen means morning if preceded by the day of the week
                words[idx + 1] = "früh"
            if wordPrev[:6] == "nächst":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev[:5] == "letzt":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # NOTE(review): "or" binds weaker than "and", so "not fromFlag" only
        # applies to the monthsShort test - confirm this is intended.
        elif word in months or word in monthsShort and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            if wordPrev and (wordPrev[0].isdigit() or
                             (wordPrev == "of" and
                              wordPrevPrev[:1].isdigit())):
                if wordPrev == "of" and wordPrevPrev[:1].isdigit():
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July

        if (
                word == "von" or word == "nach" or word == "ab") and wordNext \
                in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "morgen" and wordPrev != "am" and \
                    wordPrev not in days:
                # morgen means tomorrow if not "am Morgen" and not
                # [day of the week] morgen
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext[:6] == "nächst":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext[:5] == "letzt":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1].startswith("diese"):
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times ----
    # timeStr is never assigned a non-empty value in this function.
    timeStr = ""
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word[:6] == "mittag":
            hrAbs = 12
            used += 1
        elif word[:11] == "mitternacht":
            hrAbs = 0
            used += 1
        elif word == "morgens" or (
                wordPrev == "am" and word == "morgen") or word == "früh":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word[:10] == "nachmittag":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word[:5] == "abend":
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "stunde" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev[:4] == "halb":
                minOffset = 30
            elif wordPrev == "viertel":
                minOffset = 15
            elif wordPrev == "dreiviertel":
                minOffset = 45
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                stage = 0
                length = len(word)
                # NOTE(review): "i -= 1" does not rewind a Python for loop,
                # so the character that triggers stage 2 is not part of
                # remainder.
                for i in range(length):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                        elif word[i] == ":":
                            stage = 1
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 2:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "abends":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "am" and wordNextNext == "morgen":
                        remainder = "am"
                        used += 2
                    elif wordNext == "am" and wordNextNext == "nachmittag":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "am" and wordNextNext == "abend":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "morgens":
                        remainder = "am"
                        used += 1
                    elif wordNext == "nachmittags":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "abends":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "heute" and wordNextNext == "morgen":
                        remainder = "am"
                        used = 2
                    elif wordNext == "heute" and wordNextNext == "nachmittag":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "heute" and wordNextNext == "abend":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "nachts":
                        # strHH is still a string here; comparing it with an
                        # int raises TypeError, so convert first.
                        if int(strHH) > 4:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 1
                    else:
                        if timeQualifier != "":
                            # An evening/afternoon qualifier elsewhere in the
                            # phrase marks the time as pm.  The hour is
                            # shifted by the generic am/pm handling below
                            # (the old code added 12 to a string here, which
                            # raised TypeError).
                            if int(strHH) <= 12 and \
                                    (timeQualifier == "abends" or
                                     timeQualifier == "nachmittags"):
                                remainder = "pm"
            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                length = len(word)
                strNum = ""
                remainder = ""
                for i in range(length):
                    if word[i].isdigit():
                        strNum += word[i]
                    else:
                        remainder += word[i]

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    if wordNext == "stund" and int(word) < 100:
                        # "in 3 hours"
                        hrOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "minut":
                        # "in 10 minutes"
                        minOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "sekund":
                        # in 5 seconds
                        secOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "uhr":
                        strHH = word
                        used += 1
                        isTime = True
                        if wordNextNext == timeQualifier:
                            strMM = ""
                            if wordNextNext[:10] == "nachmittag":
                                used += 1
                                remainder = "pm"
                            elif wordNextNext == "am" and wordNextNextNext == \
                                    "nachmittag":
                                used += 2
                                remainder = "pm"
                            elif wordNextNext[:5] == "abend":
                                used += 1
                                remainder = "pm"
                            elif wordNextNext == "am" and wordNextNextNext == \
                                    "abend":
                                used += 2
                                remainder = "pm"
                            elif wordNextNext[:7] == "morgens":
                                used += 1
                                remainder = "am"
                            elif wordNextNext == "am" and wordNextNextNext == \
                                    "morgen":
                                used += 2
                                remainder = "am"
                            elif wordNextNext == "nachts":
                                used += 1
                                if 8 <= int(word) <= 12:
                                    remainder = "pm"
                                else:
                                    remainder = "am"

                        elif is_numeric(wordNextNext):
                            strMM = wordNextNext
                            used += 1
                            if wordNextNextNext == timeQualifier:
                                if wordNextNextNext[:10] == "nachmittag":
                                    used += 1
                                    remainder = "pm"
                                elif wordNextNextNext == "am" and \
                                        wordNextNextNextNext == "nachmittag":
                                    used += 2
                                    remainder = "pm"
                                elif wordNextNextNext[:5] == "abend":
                                    used += 1
                                    remainder = "pm"
                                elif wordNextNextNext == "am" and \
                                        wordNextNextNextNext == "abend":
                                    used += 2
                                    remainder = "pm"
                                elif wordNextNextNext[:7] == "morgens":
                                    used += 1
                                    remainder = "am"
                                elif wordNextNextNext == "am" and \
                                        wordNextNextNextNext == "morgen":
                                    used += 2
                                    remainder = "am"
                                elif wordNextNextNext == "nachts":
                                    used += 1
                                    if 8 <= int(word) <= 12:
                                        remainder = "pm"
                                    else:
                                        remainder = "am"

                    elif wordNext == timeQualifier:
                        strHH = word
                        strMM = 0
                        isTime = True
                        if wordNext[:10] == "nachmittag":
                            used += 1
                            remainder = "pm"
                        elif wordNext == "am" and wordNextNext == "nachmittag":
                            used += 2
                            remainder = "pm"
                        elif wordNext[:5] == "abend":
                            used += 1
                            remainder = "pm"
                        elif wordNext == "am" and wordNextNext == "abend":
                            used += 2
                            remainder = "pm"
                        elif wordNext[:7] == "morgens":
                            used += 1
                            remainder = "am"
                        elif wordNext == "am" and wordNextNext == "morgen":
                            used += 2
                            remainder = "am"
                        elif wordNext == "nachts":
                            used += 1
                            if 8 <= int(word) <= 12:
                                remainder = "pm"
                            else:
                                remainder = "am"

                    # if timeQualifier != "":
                    #     military = True
                    # else:
                    #     isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1
        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                words[idx + i] = ""

            # NOTE(review): clean_string() lower-cases every word, so this
            # comparison with "Uhr" cannot match.
            if wordPrev == "Uhr":
                words[words.index(wordPrev)] = ""

            if wordPrev == "früh":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == "spät":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            idx += used - 1
            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        # strptime() is given English month names, so translate first
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                          'aug',
                          'sept', 'oct', 'nov', 'dec']
        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)
        for idx, en_month in enumerate(en_monthsShort):
            datestr = datestr.replace(monthsShort[idx], en_month)

        # When a year was collected, datestr is "<month> <day> <year>";
        # "%B %d" alone would reject the trailing year with ValueError.
        date_format = "%B %d %Y" if hasYear else "%B %d"
        temp = datetime.strptime(datestr, date_format)
        if extractedDate.tzinfo:
            temp = temp.replace(tzinfo=extractedDate.tzinfo)

        if not hasYear:
            temp = temp.replace(year=extractedDate.year)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
            else:
                # the date already passed this year: assume next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
        else:
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")))

    if timeStr != "":
        # Not reached: timeStr is only ever "" (see above).
        temp = datetime(timeStr)
        extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
                                              minute=temp.strftime("%M"),
                                              second=temp.strftime("%S"))

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    # -1 marks "relative offset only, no absolute time of day"
    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)
    for idx, word in enumerate(words):
        # Guard the look-ahead: a trailing "und" has no right-hand neighbour
        # and words[idx + 1] would raise IndexError.
        following = words[idx + 1] if idx + 1 < len(words) else ""
        if words[idx] == "und" and words[idx - 1] == "" \
                and following == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())

    return [extractedDate, resultStr]


def is_fractional_de(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): not used by this function
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    if input_str.lower().startswith("halb"):
        return 0.5

    if input_str.lower() == "drittel":
        return 1.0 / 3
    elif input_str.endswith('tel'):
        if input_str.endswith('stel'):
            input_str = input_str[:len(input_str) - 4]  # e.g. "hundertstel"
        else:
            input_str = input_str[:len(input_str) - 3]  # e.g. "fünftel"
        if input_str.lower() in _DE_NUMBERS:
            return 1.0 / (_DE_NUMBERS[input_str.lower()])

    return False


def is_ordinal_de(input_str):
    """
    This function takes the given text and checks if it is an ordinal number.

    Args:
        input_str (str): the string to check if ordinal
    Returns:
        (bool) or (float): False if not an ordinal, otherwise the number
        corresponding to the ordinal

    ordinals for 1, 3, 7 and 8 are irregular

    only works for ordinals corresponding to the numbers in _DE_NUMBERS

    """

    lowerstr = input_str.lower()

    if lowerstr.startswith("erste"):
        return 1
    if lowerstr.startswith("dritte"):
        return 3
    if lowerstr.startswith("siebte"):
        return 7
    if lowerstr.startswith("achte"):
        return 8

    if lowerstr[-3:] == "ste":  # from 20 suffix is -ste*
        lowerstr = lowerstr[:-3]
        if lowerstr in _DE_NUMBERS:
            return _DE_NUMBERS[lowerstr]

    if lowerstr[-4:] in ["ster", "stes", "sten", "stem"]:
        lowerstr = lowerstr[:-4]
        if lowerstr in _DE_NUMBERS:
            return _DE_NUMBERS[lowerstr]

    if lowerstr[-2:] == "te":  # below 20 suffix is -te*
        lowerstr = lowerstr[:-2]
        if lowerstr in _DE_NUMBERS:
            return _DE_NUMBERS[lowerstr]

    if lowerstr[-3:] in ["ter", "tes", "ten", "tem"]:
        lowerstr = lowerstr[:-3]
        if lowerstr in _DE_NUMBERS:
            return _DE_NUMBERS[lowerstr]

    return False


def normalize_de(text, remove_articles=True):
    """
    German string normalization.

    Collapses whitespace, optionally drops definite articles, expands the
    colloquial "net"/"nett" to "nicht" and replaces number words found in
    _DE_NUMBERS with digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop "der", "die", "das", "des", "den", "dem"
    Returns:
        (str): the normalized string
    """
    # TODO return GermanNormalizer().normalize(text, remove_articles)
    words = text.split()  # this also removed extra spaces
    normalized = ""
    for word in words:
        if remove_articles and word in ["der", "die", "das", "des", "den",
                                        "dem"]:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        contraction = ["net", "nett"]
        if word in contraction:
            expansion = ["nicht", "nicht"]
            word = expansion[contraction.index(word)]

        # Convert numbers into digits, e.g. "two" -> "2"

        if word in _DE_NUMBERS:
            word = str(_DE_NUMBERS[word])

        normalized += " " + word

    return normalized[1:]  # strip the initial space


def extract_numbers_de(text, short_scale=True, ordinals=False):
    """
        Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    return extract_numbers_generic(text, pronounce_number_de,
                                   extract_number_de,
                                   short_scale=short_scale, ordinals=ordinals)


class GermanNormalizer(Normalizer):
    """ TODO implement language specific normalizer"""
+# +from datetime import datetime, timedelta + +from dateutil.relativedelta import relativedelta + +from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ + invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer +from lingua_franca.lang.common_data_en import _ARTICLES_EN, _NUM_STRING_EN, \ + _LONG_ORDINAL_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN, \ + _NEGATIVES_EN, _SUMS_EN, _MULTIPLIES_LONG_SCALE_EN, \ + _MULTIPLIES_SHORT_SCALE_EN, _FRACTION_MARKER_EN, _DECIMAL_MARKER_EN, \ + _STRING_NUM_EN, _STRING_SHORT_ORDINAL_EN, _STRING_LONG_ORDINAL_EN, \ + _FRACTION_STRING_EN, _generate_plurals_en, _SPOKEN_EXTRA_NUM_EN + +import re +import json +from lingua_franca.internal import resolve_resource_file + + +def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. first, second, third) should + be parsed to their number values (1, 2, 3...) + + Returns: + str + The original text, with numbers subbed in where appropriate. 
+ + """ + tokens = tokenize(text) + numbers_to_replace = \ + _extract_numbers_with_text_en(tokens, short_scale, ordinals) + numbers_to_replace.sort(key=lambda number: number.start_index) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) + else: + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + + +def _extract_numbers_with_text_en(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (first, second, third, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. + + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + _extract_number_with_text_en(tokens, short_scale, + ordinals, fractional_numbers) + + if not to_replace: + break + + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + +def _extract_number_with_text_en(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + This function extracts a number from a list of Tokens. 
+ + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + _extract_number_with_text_en_helper(tokens, short_scale, + ordinals, fractional_numbers) + while tokens and tokens[0].word in _ARTICLES_EN: + tokens.pop(0) + return ReplaceableNumber(number, tokens) + + +def _extract_number_with_text_en_helper(tokens, + short_scale=True, ordinals=False, + fractional_numbers=True): + """ + Helper for _extract_number_with_text_en. + + This contains the real logic for parsing, but produces + a result that needs a little cleaning (specific, it may + contain leading articles that can be trimmed off). + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + int or float, [Tokens] + + """ + if fractional_numbers: + fraction, fraction_text = \ + _extract_fraction_with_text_en(tokens, short_scale, ordinals) + if fraction: + return fraction, fraction_text + + decimal, decimal_text = \ + _extract_decimal_with_text_en(tokens, short_scale, ordinals) + if decimal: + return decimal, decimal_text + + return _extract_whole_number_with_text_en(tokens, short_scale, ordinals) + + +def _extract_fraction_with_text_en(tokens, short_scale, ordinals): + """ + Extract fraction numbers from a string. + + This function handles text such as '2 and 3/4'. Note that "one half" or + similar will be parsed by the whole number function. + + Args: + tokens [Token]: words and their indexes in the original string. + short_scale boolean: + ordinals boolean: + + Returns: + (int or float, [Token]) + The value found, and the list of relevant tokens. + (None, None) if no fraction value is found. 
+ + """ + for c in _FRACTION_MARKER_EN: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_en(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_en(partitions[2], short_scale, + ordinals, fractional_numbers=True) + + if not numbers1 or not numbers2: + return None, None + + # ensure first is not a fraction and second is a fraction + num1 = numbers1[-1] + num2 = numbers2[0] + if num1.value >= 1 and 0 < num2.value < 1: + return num1.value + num2.value, \ + num1.tokens + partitions[1] + num2.tokens + + return None, None + + +def _extract_decimal_with_text_en(tokens, short_scale, ordinals): + """ + Extract decimal numbers from a string. + + This function handles text such as '2 point 5'. + + Notes: + While this is a helper for extractnumber_en, it also depends on + extractnumber_en, to parse out the components of the decimal. + + This does not currently handle things like: + number dot number number number + + Args: + tokens [Token]: The text to parse. + short_scale boolean: + ordinals boolean: + + Returns: + (float, [Token]) + The value found and relevant tokens. + (None, None) if no decimal value is found. + + """ + for c in _DECIMAL_MARKER_EN: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_en(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_en(partitions[2], short_scale, + ordinals, fractional_numbers=False) + + if not numbers1 or not numbers2: + return None, None + + number = numbers1[-1] + decimal = numbers2[0] + + # TODO handle number dot number number number + if "." not in str(decimal.text): + return number.value + float('0.' 
+ str(decimal.value)), \ + number.tokens + partitions[1] + decimal.tokens + return None, None + + +def _extract_whole_number_with_text_en(tokens, short_scale, ordinals): + """ + Handle numbers not handled by the decimal or fraction functions. This is + generally whole numbers. Note that phrases such as "one half" will be + handled by this function, while "one and a half" are handled by the + fraction function. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. + + """ + multiplies, string_num_ordinal, string_num_scale = \ + _initialize_number_data_en(short_scale, speech=ordinals is not None) + + number_words = [] # type: [Token] + val = False + prev_val = None + next_val = None + to_sum = [] + for idx, token in enumerate(tokens): + current_val = None + if next_val: + next_val = None + continue + + word = token.word.lower() + if word in _ARTICLES_EN or word in _NEGATIVES_EN: + number_words.append(token) + continue + + prev_word = tokens[idx - 1].word.lower() if idx > 0 else "" + next_word = tokens[idx + 1].word.lower() if idx + 1 < len(tokens) else "" + + if is_numeric(word[:-2]) and \ + (word.endswith("st") or word.endswith("nd") or + word.endswith("rd") or word.endswith("th")): + + # explicit ordinals, 1st, 2nd, 3rd, 4th.... 
Nth + word = word[:-2] + + # handle nth one + if next_word == "one": + # would return 1 instead otherwise + tokens[idx + 1] = Token("", idx) + next_word = "" + + # TODO replaces the wall of "and" and "or" with all() or any() as + # appropriate, the whole codebase should be checked for this pattern + if word not in string_num_scale and \ + word not in _STRING_NUM_EN and \ + word not in _SUMS_EN and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not is_fractional_en(word, short_scale=short_scale) and \ + not look_for_fractions(word.split('/')): + words_only = [token.word for token in number_words] + + if number_words and not all([w.lower() in _ARTICLES_EN | + _NEGATIVES_EN for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and prev_word not in multiplies \ + and prev_word not in _SUMS_EN \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in _NEGATIVES_EN \ + and prev_word not in _ARTICLES_EN: + number_words = [token] + + elif prev_word in _SUMS_EN and word in _SUMS_EN: + number_words = [token] + elif ordinals is None and \ + (word in string_num_ordinal or word in _SPOKEN_EXTRA_NUM_EN): + # flagged to ignore this token + continue + else: + number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? + if word in _STRING_NUM_EN: + val = _STRING_NUM_EN.get(word) + current_val = val + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + + # is the prev word an ordinal number and current word is one? 
+ # second one, third one + if ordinals and prev_word in string_num_ordinal and val == 1: + val = prev_val + + # is the prev word a number and should we sum it? + # twenty two, fifty six + if (prev_word in _SUMS_EN and val and val < 10) or all([prev_word in + multiplies, + val < prev_val if prev_val else False]): + val = prev_val + val + + # is the prev word a number and should we multiply it? + # twenty hundred, six hundred + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + + # is this a spoken fraction? + # half cup + if val is False and \ + not (ordinals is None and word in string_num_ordinal): + val = is_fractional_en(word, short_scale=short_scale, + spoken=ordinals is not None) + + current_val = val + + # 2 fifths + if ordinals is False: + next_val = is_fractional_en(next_word, short_scale=short_scale) + if next_val: + if not val: + val = 1 + val = val * next_val + number_words.append(tokens[idx + 1]) + + # is this a negative number? + if val and prev_word and prev_word in _NEGATIVES_EN: + val = 0 - val + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + current_val = val + + else: + if current_val and all([ + prev_word in _SUMS_EN, + word not in _SUMS_EN, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. + number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. `current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) 
+ # + # We have a dict `string_num_scale` containing [value, word] + # pairs for "all" powers of ten: string_num_scale[10] == "ten. + # + # We need go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million".`val` is 9000000. + # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. + # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. + # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions. 
+ # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + + time_to_sum = True + for other_token in tokens[idx+1:]: + if other_token.word.lower() in multiplies: + if string_num_scale[other_token.word.lower()] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + val += sum(to_sum) + + return val, number_words + + +def _initialize_number_data_en(short_scale, speech=True): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. 
+ + Args: + short_scale (bool): + speech (bool): consider extra words (_SPOKEN_EXTRA_NUM_EN) to be numbers + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = _MULTIPLIES_SHORT_SCALE_EN if short_scale \ + else _MULTIPLIES_LONG_SCALE_EN + + string_num_ordinal_en = _STRING_SHORT_ORDINAL_EN if short_scale \ + else _STRING_LONG_ORDINAL_EN + + string_num_scale_en = _SHORT_SCALE_EN if short_scale else _LONG_SCALE_EN + string_num_scale_en = invert_dict(string_num_scale_en) + string_num_scale_en.update(_generate_plurals_en(string_num_scale_en)) + + if speech: + string_num_scale_en.update(_SPOKEN_EXTRA_NUM_EN) + return multiplies, string_num_ordinal_en, string_num_scale_en + + +def extract_number_en(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + return _extract_number_with_text_en(tokenize(text.lower()), + short_scale, ordinals).value + + +def extract_duration_en(text): + """ + Convert an english phrase into a number of seconds + + Convert things like: + "10 minute" + "2 and a half hours" + "3 days 8 hours 10 minutes and 49 seconds" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. 
The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. + """ + if not text: + return None + + time_units = { + 'microseconds': 0, + 'milliseconds': 0, + 'seconds': 0, + 'minutes': 0, + 'hours': 0, + 'days': 0, + 'weeks': 0 + } + + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}s?" + text = _convert_words_to_numbers_en(text) + + for unit_en in time_units: + unit_pattern = pattern.format(unit=unit_en[:-1]) # remove 's' from unit + + def repl(match): + time_units[unit_en] += float(match.group(1)) + return '' + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_en(text, anchorDate=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "today" + "tomorrow afternoon" + "next Tuesday at 4pm" + "August 3rd" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "what is Tuesday's weather forecast" + returns the date for the forthcoming Tuesday relative to the reference + date and the remainder string + "what is weather forecast". + + The "next" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. + + Args: + text (str): string containing date words + anchorDate (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. 
+ """ + + def clean_string(s): + # normalize and lowercase utt (replaces words with numbers) + s = _convert_words_to_numbers_en(s, ordinals=None) + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ + .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ + .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ + .replace("oclock", "o'clock").replace("couple", "2") \ + .replace("centuries", "century").replace("decades", "decade") \ + .replace("millenniums", "millennium") + + wordList = s.split() + for idx, word in enumerate(wordList): + word = word.replace("'s", "") + + ordinals = ["rd", "st", "nd", "th"] + if word[0].isdigit(): + for ordinal in ordinals: + # "second" is the only case we should not do this + if ordinal in word and "second" not in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if not anchorDate: + anchorDate = now_local() + + if text == "": + return None + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['morning'] + timeQualifiersPM = ['afternoon', 'evening', 'night', 'tonight'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of', "within"] + days = ['monday', 'tuesday', 'wednesday', + 'thursday', 'friday', 'saturday', 'sunday'] + months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 
'november', + 'december'] + recur_markers = days + [d + 's' for d in days] + ['weekend', 'weekday', + 'weekends', 'weekdays'] + monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', + 'sept', 'oct', 'nov', 'dec'] + year_multiples = ["decade", "century", "millennium"] + day_multiples = ["weeks", "months", "years"] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + word = word.rstrip('s') + start = idx + used = 0 + # save timequalifier for later + if word == "ago" and dayOffset: + dayOffset = - dayOffset + used += 1 + if word == "now" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_en(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if wordNext == "decade": + yearOffset = multiplier * 10 + elif wordNext == "century": + yearOffset = multiplier * 100 + elif wordNext == "millennium": + yearOffset = multiplier * 1000 + # couple of + elif word == "2" and wordNext == "of" and \ + wordNextNext in year_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "decade": + yearOffset = multiplier * 10 + elif wordNextNext == "century": + yearOffset = multiplier * 100 + elif wordNextNext == "millennium": + yearOffset = multiplier * 1000 + elif word == "2" and wordNext == "of" and \ + wordNextNext in day_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "years": + yearOffset = multiplier + elif wordNextNext == "months": + 
monthOffset = multiplier + elif wordNextNext == "weeks": + dayOffset = multiplier * 7 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "today" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "tomorrow" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "day" and wordNext == "before" and wordNextNext == "yesterday" and not fromFlag: + dayOffset = -2 + used += 3 + elif word == "before" and wordNext == "yesterday" and not fromFlag: + dayOffset = -2 + used += 2 + elif word == "yesterday" and not fromFlag: + dayOffset = -1 + used += 1 + elif (word == "day" and + wordNext == "after" and + wordNextNext == "tomorrow" and + not fromFlag and + (not wordPrev or not wordPrev[0].isdigit())): + dayOffset = 2 + used = 3 + if wordPrev == "the": + start -= 1 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "day": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "week" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "next": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "last": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "month" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "next": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "last": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "year" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "next": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "last": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "next": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "last": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == "of" and wordPrevPrev[0].isdigit())): + if wordPrev == "of" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # if no date indicators found, it may not be the month of May + # may "i/we" ... + # "... 
may be" + elif word == 'may' and wordNext in ['i', 'we', 'be']: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("today") + validFollowups.append("tomorrow") + validFollowups.append("yesterday") + validFollowups.append("next") + validFollowups.append("last") + validFollowups.append("now") + validFollowups.append("this") + if (word == "from" or word == "after") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "tomorrow": + dayOffset += 1 + elif wordNext == "yesterday": + dayOffset -= 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "next": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "last": + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "this": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "noon": + hrAbs = 12 + used += 1 + elif word == "midnight": + hrAbs = 0 + used += 1 + elif word == "morning": + if hrAbs is None: + hrAbs = 8 + used 
+= 1 + elif word == "afternoon": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "evening": + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "tonight" or word == "night": + if hrAbs is None: + hrAbs = 22 + # used += 1 ## NOTE this breaks other tests, TODO refactor me! + + # couple of time_unit + elif word == "2" and wordNext == "of" and \ + wordNextNext in ["hours", "minutes", "seconds"]: + used += 3 + if wordNextNext == "hours": + hrOffset = 2 + elif wordNextNext == "minutes": + minOffset = 2 + elif wordNextNext == "seconds": + secOffset = 2 + # parse half an hour, quarter hour + elif word == "hour" and \ + (wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "half": + minOffset = 30 + elif wordPrev == "quarter": + minOffset = 15 + elif wordPrevPrev == "quarter": + minOffset = 15 + if idx > 2 and words[idx - 3] in markers: + words[idx - 3] = "" + words[idx - 2] = "" + elif wordPrev == "within": + hrOffset = 1 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "this": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "minute" and wordPrev == "in": + minOffset = 1 + words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "second" and wordPrev == "in": + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + if wordNext == "tonight" or wordNextNext == "tonight" or \ + wordPrev == "tonight" or wordPrevPrev == "tonight" or \ + wordNextNextNext == "tonight": + remainder = "pm" + used += 1 + if wordPrev == "tonight": + words[idx - 1] = "" + if wordPrevPrev == "tonight": + words[idx - 2] = "" + if wordNextNext == "tonight": + used += 1 + if wordNextNextNext == "tonight": + used += 1 + + if ':' in word: + # parse colons + # 
"3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "morning": + remainder = "am" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "afternoon": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "evening": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "morning": + remainder = "am" + used += 2 + elif wordNext == "in" and wordNextNext == "afternoon": + remainder = "pm" + used += 2 + elif wordNext == "in" and wordNextNext == "evening": + remainder = "pm" + used += 2 + elif wordNext == "this" and wordNextNext == "morning": + remainder = "am" + used = 2 + daySpecified = True + elif wordNext == "this" and wordNextNext == "afternoon": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "this" and wordNextNext == "evening": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "at" and wordNextNext == "night": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. 
+ length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if ( + int(strNum) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hours": + used += 1 + elif ( + (wordNext == "hours" or wordNext == "hour" or + remainder == "hours" or remainder == "hour") and + word[0] != '0' and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minutes" or wordNext == "minute" or \ + remainder == "minutes" or remainder == "minute": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "seconds" or wordNext == "second" \ + or remainder == "seconds" or remainder == "second": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. 
"3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hours" or wordNext == "hour" or \ + remainder == "hours" or remainder == "hour": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. "04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext == "hours" or + wordNextNext == "hour" or + remainder == "hours" or remainder == "hour"): + used += 1 + elif ( + wordNext == "" or wordNext == "o'clock" or + ( + wordNext == "in" and + ( + wordNextNext == "the" or + wordNextNext == timeQualifier + ) + ) or wordNext == 'tonight' or + wordNextNext == 'tonight'): + + strHH = strNum + strMM = "00" + if wordNext == "o'clock": + used += 1 + + if wordNext == "in" or wordNextNext == "in": + used += (1 if wordNext == "in" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm', 'hours', 'minutes', + "second", "seconds", + "hour", "minute"] and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already 
passed + if anchorDate.hour < HH or (anchorDate.hour == HH and + anchorDate.minute < MM): + pass # No modification needed + elif anchorDate.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "o" or wordPrev == "oh": + words[words.index(wordPrev)] = "" + + if wordPrev == "early": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "late": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "this": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "this": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if 
def is_fractional_en(input_str, short_scale=True, spoken=True):
    """
    Check whether the given word names a unit fraction.

    Args:
        input_str (str): the word to check, e.g. "half" or "fifths"
        short_scale (bool): look ordinals up in the short-scale table if
            True, in the long-scale table if False
        spoken (bool): when False nothing is treated as a fraction and the
            function always returns False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
            as a float (e.g. 0.2 for "fifth")

    """
    if not spoken:
        # The result is gated on `spoken`; skip building the lookup table.
        return False

    # Lower-case first so the plural check also works on "FIFTHS".
    word = input_str.lower()
    if word.endswith("s"):
        word = word[:-1]  # e.g. "fifths" -> "fifth"

    ordinals = _SHORT_ORDINAL_EN if short_scale else _LONG_ORDINAL_EN
    fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
    # "first" and "second" are never fractions, so only ordinals above 2
    # are added as denominators.
    fracts.update({name: num for num, name in ordinals.items() if num > 2})

    if word in fracts:
        return 1.0 / fracts[word]
    return False
# Spanish fraction words (singular form) mapped to their denominator.
_FRACTION_DENOMINATORS_ES = {
    "medio": 2, "media": 2,
    "tercio": 3,
    "cuarto": 4, "cuarta": 4,
    "quinto": 5, "quinta": 5,
    "sexto": 6, "sexta": 6,
    "séptimo": 7, "séptima": 7,
    "octavo": 8, "octava": 8,
    "noveno": 9, "novena": 9,
    "décimo": 10, "décima": 10,
    "onceavo": 11, "onceava": 11,
    "doceavo": 12, "doceava": 12,
    "vigésimo": 20, "vigésima": 20,
    "trigésimo": 30, "trigésima": 30,
    "centésimo": 100, "centésima": 100,
    "milésimo": 1000, "milésima": 1000,
}


def is_fractional_es(input_str, short_scale=True):
    """
    Check whether the given Spanish word names a unit fraction.

    The comparison is case-insensitive and a single trailing "s" (plural,
    e.g. "tercios") is ignored.

    Args:
        input_str (str): the word to check if fractional
        short_scale (bool): accepted for signature parity with the other
            languages; it is not used by this function
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
            as a float (e.g. 0.5 for "medio")

    """
    # Normalise once and use the same key for the membership test and the
    # lookup; the previous code tested the lower-cased word but indexed
    # with the raw one, which raised KeyError for e.g. "Medio".
    word = input_str.lower()
    if word.endswith("s"):
        word = word[:-1]  # e.g. "quintos" -> "quinto"

    denominator = _FRACTION_DENOMINATORS_ES.get(word)
    if denominator is None:
        return False
    return 1.0 / denominator
+ if word in _STRING_NUM_ES: + val = _STRING_NUM_ES[word] + elif word.isdigit(): # doesn't work with decimals + val = int(word) + elif is_numeric(word): + val = float(word) + elif is_fractional_es(word): + if not result: + result = 1 + result = result * is_fractional_es(word) + count += 1 + continue + + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + # if (len(aPieces) == 2 and is_numeric(aPieces[0]) + # and is_numeric(aPieces[1])): + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + + if val: + if result is None: + result = 0 + # handle fractions + if next_word != "avos": + result = val + else: + result = float(result) / float(val) + + if next_word is None: + break + + # number word and fraction + ands = ["y"] + if next_word in ands: + zeros = 0 + if result is None: + count += 1 + continue + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + + afterAndVal = extract_number_es(newText[:-1]) + if afterAndVal: + if result < afterAndVal or result < 20: + while afterAndVal > 1: + afterAndVal = afterAndVal / 10.0 + for word in newWords: + if word == "cero" or word == "0": + zeros += 1 + else: + break + for _ in range(0, zeros): + afterAndVal = afterAndVal / 10.0 + result += afterAndVal + break + elif next_next_word is not None: + if next_next_word in ands: + newWords = aWords[count + 3:] + newText = "" + for word in newWords: + newText += word + " " + afterAndVal = extract_number_es(newText[:-1]) + if afterAndVal: + if result is None: + result = 0 + result += afterAndVal + break + + decimals = ["punto", "coma", ".", ","] + if next_word in decimals: + zeros = 0 + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + for word in newWords: + if word == "cero" or word == "0": + zeros += 1 + else: + break + afterDotVal = str(extract_number_es(newText[:-1])) + afterDotVal = zeros * "0" + afterDotVal + result = float(str(result) + "." 
def _es_number_parse(words, i):
    """
    Parse a spelled-out Spanish number starting at position i of words.

    Recognised shapes, looked up word by word in _STRING_NUM_ES:
        * "cero" (a word whose value is 0)
        * a single word worth 1-29
        * a tens word (30-90), optionally followed by "y" and a 1-9 word
        * a hundreds word (100-900), optionally followed by a 1-99 number
        * any of the above (1-999) followed by "mil" and optionally
          another 1-999 number

    Args:
        words (list of str): the tokenised sentence
        i (int): index of the first word to examine
    Returns:
        (tuple) or None: (value, next_index), where next_index is the index
            of the first word after the number, or None when no number
            starts at position i
    """

    def literal(pos, expected):
        # Match one fixed word (e.g. "y" or "mil") at position pos.
        if pos < len(words) and words[pos] == expected:
            return expected, pos + 1
        return None

    def number_word(pos, low, high):
        # Match a single number word whose value lies within [low, high].
        if pos < len(words):
            value = _STRING_NUM_ES.get(words[pos])
            # Compare against None rather than truthiness: 0 is a valid
            # value, and the old `if v and ...` test made "cero"
            # unparseable.
            if value is not None and low <= value <= high:
                return value, pos + 1
        return None

    def number_1_99(pos):
        # 1-29 are single words.
        single = number_word(pos, 1, 29)
        if single:
            return single

        # 30-90, optionally followed by "y" <1-9>.
        tens = number_word(pos, 30, 90)
        if not tens:
            return None
        tens_value, after_tens = tens
        conjunction = literal(after_tens, "y")
        if conjunction:
            units = number_word(conjunction[1], 1, 9)
            if units:
                units_value, after_units = units
                return tens_value + units_value, after_units
        # A trailing "y" without a unit word is not consumed.
        return tens

    def number_1_999(pos):
        # <hundreds> [1-99]?
        hundreds = number_word(pos, 100, 900)
        if hundreds:
            hundreds_value, after_hundreds = hundreds
            rest = number_1_99(after_hundreds)
            if rest:
                rest_value, after_rest = rest
                return hundreds_value + rest_value, after_rest
            return hundreds

        # [1-99]
        return number_1_99(pos)

    def number(pos):
        # "cero"
        zero = number_word(pos, 0, 0)
        if zero:
            return zero

        # [1-999] ("mil" [1-999]?)?
        leading = number_1_999(pos)
        if not leading:
            return None
        leading_value, after_leading = leading
        thousand = literal(after_leading, "mil")
        if not thousand:
            return leading
        after_mil = thousand[1]
        trailing = number_1_999(after_mil)
        if trailing:
            trailing_value, after_trailing = trailing
            return leading_value * 1000 + trailing_value, after_trailing
        return leading_value * 1000, after_mil

    return number(i)
def normalize_es(text, remove_articles=True):
    """
    Spanish string normalization.

    Collapses whitespace, optionally drops articles and replaces
    spelled-out numbers with digits.

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): drop the words listed in _ARTICLES_ES
    Returns:
        (str): the normalized sentence, words separated by single spaces
    """
    # TODO return SpanishNormalizer().normalize(text, remove_articles)
    tokens = text.split()  # splitting on whitespace also drops extra spaces
    total = len(tokens)

    pieces = []
    pos = 0
    while pos < total:
        token = tokens[pos]

        if remove_articles and token in _ARTICLES_ES:
            pos += 1
            continue

        # Convert numbers into digits; the parser reports where it stopped.
        parsed = _es_number_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
["mañanas", "tardes", "noches", "días", "semanas", + "años", "minutos", "segundos", "las", "los", "siguientes", + "próximas", "próximos", "horas"] + for _, word in enumerate(wordlist): + s = s.replace(word, word.rstrip('s')) + s = s.replace("meses", "mes").replace("anteriores", "anterior") + return s + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + if anchorDate is None: + anchorDate = now_local() + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(text).split(" ") + timeQualifiersList = ['mañana', 'tarde', 'noche'] + time_indicators = ["en", "la", "al", "por", "pasados", + "pasadas", "día", "hora"] + days = ['lunes', 'martes', 'miércoles', + 'jueves', 'viernes', 'sábado', 'domingo'] + months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', + 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', + 'diciembre'] + monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', + 'sep', 'oct', 'nov', 'dic'] + nexts = ["siguiente", "próximo", "próxima"] + suffix_nexts = ["siguientes", "subsecuentes"] + lasts = ["último", "última"] + suffix_lasts = ["pasada", "pasado", "anterior", "antes"] + nxts = ["después", "siguiente", "próximo", "próxima"] + prevs = ["antes", "previa", "previo", "anterior"] + froms = ["desde", "en", "para", "después de", "por", "próximo", + "próxima", "de"] + thises = ["este", "esta"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else 
"" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif word == "hoy" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "mañana" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "ayer" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "anteayer" or + (word == "ante" and wordNext == "ayer")) and not fromFlag: + dayOffset -= 2 + used += 1 + if wordNext == "ayer": + used += 1 + elif word == "ante" and wordNext == "ante" and wordNextNext == \ + "ayer" and not fromFlag: + dayOffset -= 3 + used += 3 + elif word == "ante anteayer" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow + elif word == "pasado" and wordNext == "mañana" and not fromFlag: + dayOffset += 2 + used = 2 + # day before yesterday + elif word == "ante" and wordNext == "ayer" and not fromFlag: + dayOffset -= 2 + used = 2 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "día": + if wordNext == "pasado" or wordNext == "ante": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "semana" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if 
wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "año" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "siguiente": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "pasado": + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "siguiente": + # dayOffset += 7 + used += 1 + elif wordNext == "pasado": + # dayOffset -= 7 + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 mayo + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # mayo 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia mayo + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # mayo dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("hoy") + validFollowups.append("mañana") + validFollowups.append("ayer") + validFollowups.append("anteayer") + validFollowups.append("ahora") + validFollowups.append("ya") + 
validFollowups.append("ante") + + # TODO debug word "depois" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == "mañana" and wordNext == "ayer") and not ( + word == "pasado" or word == "antes"): + used = 2 + fromFlag = True + if wordNext == "mañana" and word != "pasado": + dayOffset += 1 + elif wordNext == "ayer": + dayOffset -= 1 + elif wordNext == "anteayer": + dayOffset -= 2 + elif wordNext == "ante" and wordNextNext == "ayer": + dayOffset -= 2 + elif (wordNext == "ante" and wordNext == "ante" and + wordNextNextNext == "ayer"): + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + # if wordNextNext == "feira": + # used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + # if wordNextNextNext == "feira": + # used += 1 + if wordNext in months: + used -= 1 + if used > 0: + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = 
words[idx + 3] if idx + 3 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "medio" and wordNext == "día": + hrAbs = 12 + used += 2 + elif word == "media" and wordNext == "noche": + hrAbs = 0 + used += 2 + elif word == "mañana": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "tarde": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "media" and wordNext == "tarde": + if not hrAbs: + hrAbs = 17 + used += 2 + elif word == "tarde" and wordNext == "noche": + if not hrAbs: + hrAbs = 20 + used += 2 + elif word == "media" and wordNext == "mañana": + if not hrAbs: + hrAbs = 10 + used += 2 + # elif word == "fim" and wordNext == "tarde": + # if not hrAbs: + # hrAbs = 19 + # used += 2 + # elif word == "fim" and wordNext == "manha": + # if not hrAbs: + # hrAbs = 11 + # used += 2 + elif word == "madrugada": + if not hrAbs: + hrAbs = 1 + used += 2 + elif word == "noche": + if not hrAbs: + hrAbs = 21 + used += 1 + # parse half an hour, quarter hour + elif (word == "hora" and + (wordPrev in time_indicators or wordPrevPrev in + time_indicators)): + if wordPrev == "media": + minOffset = 30 + elif wordPrev == "cuarto": + minOffset = 15 + elif wordPrevPrev == "cuarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", 
"") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "mañana" or wordNext == "madrugada": + remainder = "am" + used += 1 + elif wordNext == "tarde": + remainder = "pm" + used += 1 + elif wordNext == "noche": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == "mañana": + remainder = "am" + used = 2 + elif wordNext in thises and wordNextNext == "tarde": + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "noche": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + if strHH <= 12 and \ + (timeQualifier == "mañana" or + timeQualifier == "tarde"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarde"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." 
or + wordNext == "mañana"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + # wordPrev == "o" or + # wordPrev == "oh" or + wordPrev == "cero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuto": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundo": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "en" and wordNextNext == "punto"): + strHH = word + strMM = 00 + if wordNext == "en" and wordNextNext == "punto": + used += 2 + if wordNextNextNext == "tarde": + remainder = "pm" + used += 1 + elif wordNextNextNext == "mañana": + remainder = "am" + used += 1 + elif wordNextNextNext == "noche": + if 0 > strHH > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "en" or wordPrev == "punto": + 
words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + + if not hasYear: + temp = temp.replace(year=extractedDate.year) + + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + + if hrAbs != -1 and 
def get_gender_es(word, context=""):
    """ Guess the gender of a word

    Some languages assign genders to specific words. This method will attempt
    to determine the gender, optionally using the provided context sentence.

    Args:
        word (str): The word to look up
        context (str, optional): String containing word, for context

    Returns:
        str or bool: The code "m" (male) or "f" (female) for the gender,
            or False when no rule applies (including an empty word).
    """
    # Next rules are imprecise and incomplete, but are a good starting point.
    # For more detailed explanation, see
    # http://www.wikilengua.org/index.php/Género_gramatical
    word = word.rstrip("s")  # compare on the singular form
    gender = False

    # If the word appears in the context (not as its first token), take the
    # gender of the token right before it, typically an article.
    words = context.split(" ")
    for idx, candidate in enumerate(words):
        if idx != 0 and candidate == word:
            gender = get_gender_es(words[idx - 1])
            break

    if not gender:
        # endswith() is safe on an empty string, unlike word[-1], which
        # raised IndexError for "", "s" or an empty context token.
        if word.endswith("a"):
            gender = "f"
        elif word.endswith(("o", "e")):
            gender = "m"
    return gender
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
from datetime import timedelta

from lingua_franca.internal import resolve_resource_file
from lingua_franca.lang.common_data_fa import (_FARSI_BIG, _FARSI_HUNDREDS,
                                               _FARSI_ONES, _FARSI_TENS,
                                               _FORMAL_VARIANT)
from lingua_franca.lang.parse_common import Normalizer
from lingua_franca.time import now_local


def _is_number(s):
    """Return True when ``s`` can be converted with ``float()``."""
    try:
        float(s)
        return True
    except ValueError:
        return False


def _parse_sentence(text):
    """Tokenize ``text`` and collapse spelled-out / numeric numbers.

    Every key of ``_FORMAL_VARIANT`` found in the text is first replaced by
    its value, then the text is split on whitespace.

    Args:
        text (str): the sentence to tokenize

    Returns:
        list: the tokens in order. A recognised number is emitted as a tuple
        ``(value, words)`` where ``words`` is the list of source words that
        formed it; every other token is emitted as the original ``str``.
    """
    for key, value in _FORMAL_VARIANT.items():
        text = text.replace(key, value)
    ar = text.split()
    result = []
    current_number = 0   # value of the completed "big" groups so far
    current_words = []   # source words of the number being assembled
    s = 0                # value of the group currently being assembled
    # 'init' means no number is in progress; 'num*' modes record which kind
    # of number word was seen last ('_va' suffix: followed by the joiner "و").
    mode = 'init'

    def finish_num():
        """Emit the number in progress (if any) and reset the state."""
        nonlocal current_number, s, mode, current_words
        current_number += s
        # Test the collected words rather than the value so that an explicit
        # zero is emitted as a number instead of vanishing from the output.
        if current_words:
            result.append((current_number, current_words))
        s = 0
        current_number = 0
        current_words = []
        mode = 'init'

    for x in ar:
        if x == "و":
            if mode == 'num_ten' or mode == 'num_hundred' or mode == 'num_one':
                mode += '_va'
                current_words.append(x)
            elif mode == 'num':
                current_words.append(x)
            else:
                finish_num()
                result.append(x)
        elif x == "نیم":
            current_words.append(x)
            current_number += 0.5
            finish_num()
        elif x in _FARSI_ONES:
            # The position in the table is used as the numeric value.
            t = _FARSI_ONES.index(x)
            if mode != 'init' and mode != 'num_hundred_va' and mode != 'num':
                if not (t < 10 and mode == 'num_ten_va'):
                    finish_num()
            current_words.append(x)
            s += t
            mode = 'num_one'
        elif x in _FARSI_TENS:
            if mode != 'init' and mode != 'num_hundred_va' and mode != 'num':
                finish_num()
            current_words.append(x)
            s += _FARSI_TENS.index(x) * 10
            mode = 'num_ten'
        elif x in _FARSI_HUNDREDS:
            if mode != 'init' and mode != 'num':
                finish_num()
            current_words.append(x)
            s += _FARSI_HUNDREDS.index(x) * 100
            mode = 'num_hundred'
        elif x in _FARSI_BIG:
            current_words.append(x)
            d = _FARSI_BIG.index(x)
            # NOTE(review): presumably lets the index-1 scale word stand
            # alone with an implied multiplier of one -- confirm against
            # the contents of _FARSI_BIG.
            if mode == 'init' and d == 1:
                s = 1
            s *= 10 ** (3 * d)
            current_number += s
            s = 0
            mode = 'num'
        elif _is_number(x):
            current_words.append(x)
            current_number = float(x)
            finish_num()
        else:
            finish_num()
            result.append(x)
    if mode[:3] == 'num':
        finish_num()
    return result


_time_units = {
    'ثانیه': timedelta(seconds=1),
    'دقیقه': timedelta(minutes=1),
    'ساعت': timedelta(hours=1),
}

_date_units = {
    'روز': timedelta(days=1),
    'هفته': timedelta(weeks=1),
}


def extract_duration_fa(text):
    """Extract a duration from a Persian phrase.

    A number token immediately followed by a unit word from ``_time_units``
    or ``_date_units`` contributes ``number * unit`` to the total. The words
    used for the duration are consumed and the rest is returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
            A tuple containing the summed duration and the remaining text
            not consumed in the parsing, joined with single spaces. When no
            duration is found the first value is ``timedelta(0)``. The
            joiner word "و" is always dropped from the remainder.
    """
    remainder = []
    pending = None  # last (value, words) number not yet attached to a unit
    result = timedelta(0)

    def flush_pending():
        """Give the words of an unused number back to the remainder."""
        nonlocal pending
        if pending is not None:
            remainder.extend(pending[1])
        pending = None

    for token in _parse_sentence(text):
        if token == "و":
            continue
        if isinstance(token, tuple):
            # A second number before any unit: the first one was not part
            # of a duration, so keep its words instead of losing them.
            flush_pending()
            pending = token
            continue
        unit = _time_units.get(token, _date_units.get(token))
        if unit is not None and pending is not None:
            result += unit * pending[0]
            pending = None
        else:
            # Ordinary word, or a unit word with no count in front of it
            # (previously a TypeError): leave it in the remainder.
            flush_pending()
            remainder.append(token)
    # A number at the very end of the sentence belongs to the remainder.
    flush_pending()
    return (result, " ".join(remainder))


def extract_datetime_fa(text, anchorDate=None, default_time=None):
    """Convert a Persian date/time reference into an exact datetime.

    Recognised vocabulary (see the tables in the body): weekday names,
    day words relative to today, two time-of-day words, "now", and relative
    offsets of the form ``<number> <unit> <next/previous word>``. If a
    reference date is not provided, the current local time is used. Words
    used to define the date are consumed and the rest is returned.

    Args:
        text (str): string containing date words
        anchorDate (datetime): reference date/time for relative expressions
        default_time (time): accepted for API compatibility; currently
            ignored by this implementation

    Returns:
        None when ``text`` is empty, or when a next/previous word follows an
        already resolved day (the parser gives up rather than guess).
        Otherwise a tuple ``(datetime_or_None, remainder)`` where the first
        item is None if no date or time related text was found.
    """
    if text == "":
        return None
    # Normalise: zero-width non-joiner -> space, drop punctuation, and glue
    # multi-word day / time names into single tokens.
    text = (text.lower()
            .replace('\u200c', ' ')
            .replace('.', '')
            .replace('،', '')
            .replace('?', '')
            .replace("پس فردا", "پسفردا")
            .replace('یک شنبه', 'یکشنبه')
            .replace('دو شنبه', 'دوشنبه')
            .replace('سه شنبه', 'سهشنبه')
            .replace('چهار شنبه', 'چهارشنبه')
            .replace('پنج شنبه', 'پنجشنبه')
            .replace('بعد از ظهر', 'بعدازظهر'))

    if not anchorDate:
        anchorDate = now_local()
    today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0)
    # "%w": Sunday is 0 ... Saturday is 6.
    today_weekday = int(anchorDate.strftime("%w"))
    # Index + 1 is compared against today_weekday below.
    weekday_names = [
        'دوشنبه',
        'سهشنبه',
        'چهارشنبه',
        'پنجشنبه',
        'جمعه',
        'شنبه',
        'یکشنبه',
    ]
    daysDict = {
        'پریروز': today + timedelta(days=-2),
        'دیروز': today + timedelta(days=-1),
        'امروز': today,
        'فردا': today + timedelta(days=1),
        'پسفردا': today + timedelta(days=2),
    }
    timesDict = {
        'صبح': timedelta(hours=8),
        'بعدازظهر': timedelta(hours=15),
    }
    exactDict = {
        'الان': anchorDate,
    }
    nextWords = ["بعد", "دیگه"]
    prevWords = ["پیش", "قبل"]
    ar = _parse_sentence(text)
    # Modes: 'none' -> nothing found yet; 'time' -> a day is known and a
    # time-of-day word may follow; 'delta_date' / 'delta_time' -> collecting
    # a relative offset; 'finished' -> everything else is remainder.
    mode = 'none'
    number_seen = None
    delta_seen = timedelta(0)
    remainder = []
    result = None
    for x in ar:
        handled = True
        if mode == 'finished':
            # Number tokens are tuples; put their source words back so the
            # final join only ever sees strings.
            if isinstance(x, tuple):
                remainder.extend(x[1])
            else:
                remainder.append(x)
        elif x == 'و' and mode[:5] == 'delta':
            pass
        elif isinstance(x, tuple):
            number_seen = x
        elif x in weekday_names:
            dayOffset = (weekday_names.index(x) + 1) - today_weekday
            if dayOffset < 0:
                dayOffset += 7
            result = today + timedelta(days=dayOffset)
            mode = 'time'
        elif x in exactDict:
            result = exactDict[x]
            mode = 'finished'
        elif x in daysDict:
            result = daysDict[x]
            mode = 'time'
        elif x in timesDict and mode == 'time':
            result += timesDict[x]
            # Was 'finish', a state no branch recognises, so parsing kept
            # going and a later day word could overwrite the result.
            mode = 'finished'
        elif x in _date_units:
            k = 1
            if number_seen:
                k = number_seen[0]
                number_seen = None
            delta_seen += _date_units[x] * k
            if mode != 'delta_time':
                mode = 'delta_date'
        elif x in _time_units:
            k = 1
            if number_seen:
                k = number_seen[0]
                number_seen = None
            delta_seen += _time_units[x] * k
            mode = 'delta_time'
        elif x in nextWords or x in prevWords:
            # Give up instead of incorrect result
            if mode == 'time':
                return None
            # The direction word decides whether the collected offset lies
            # in the future or in the past.
            sign = 1 if x in nextWords else -1
            if mode == 'delta_date':
                result = today + sign * delta_seen
                mode = 'time'
            elif mode == 'delta_time':
                result = anchorDate + sign * delta_seen
                mode = 'finished'
            else:
                handled = False
        else:
            handled = False
        if handled:
            continue
        if number_seen:
            remainder.extend(number_seen[1])
            number_seen = None
        remainder.append(x)
    # A number that was never attached to a unit stays in the remainder.
    if number_seen:
        remainder.extend(number_seen[1])
    return (result, " ".join(remainder))


def is_fractional_fa(input_str, short_scale=True):
    """Check whether the given text is a fraction word.

    NOTE(review): the built-in word table is English ("whole", "half", ...)
    and the trailing-"s" stripping is an English plural rule; this looks
    carried over from the English parser -- confirm whether Persian fraction
    words should be used instead.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use the short-scale ordinal table if True,
            the long-scale one if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    if input_str.endswith('s'):
        input_str = input_str[:-1]  # e.g. "fifths"

    fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
    # This module never imports or defines the ordinal tables, so naming
    # them directly raised NameError on every call. Look them up
    # defensively: use them when the module provides them, otherwise fall
    # back to the built-in words only.
    table_name = "_SHORT_ORDINAL_FA" if short_scale else "_LONG_ORDINAL_FA"
    ordinal_table = globals().get(table_name) or {}
    for num in ordinal_table:
        if num > 2:
            fracts[ordinal_table[num]] = num

    key = input_str.lower()
    if key in fracts:
        return 1.0 / fracts[key]
    return False


def extract_numbers_fa(text, short_scale=True, ordinals=False):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): accepted for API compatibility; not used here
        ordinals (bool): accepted for API compatibility; not used here
    Returns:
        list: the values of all number tokens found by ``_parse_sentence``,
        in order of appearance
    """
    return [token[0] for token in _parse_sentence(text)
            if isinstance(token, tuple)]


def extract_number_fa(text, ordinals=False):
    """
    Extract the first number from a text string.

    Args:
        text (str): the string to extract a number from
        ordinals (bool): passed through to ``extract_numbers_fa``
    Returns:
        (int) or (float) or False: The first extracted number or False if
                                   no number was found

    """
    numbers = extract_numbers_fa(text, ordinals=ordinals)
    if not numbers:
        return False
    return numbers[0]


# Patch metadata and licence header of the next file, kept from the source:
# diff --git a/lingua_franca/lang/parse_fr.py b/lingua_franca/lang/parse_fr.py
# new file mode 100644
# index 0000000..1956182
# --- /dev/null
# +++ b/lingua_franca/lang/parse_fr.py
# @@ -0,0 +1,1090 @@
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# +import re +from dateutil.tz import gettz +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ + extract_numbers_generic, Normalizer +from lingua_franca.lang.format_fr import pronounce_number_fr +from lingua_franca.lang.common_data_fr import _ARTICLES_FR, _NUMBERS_FR, \ + _ORDINAL_ENDINGS_FR +from lingua_franca.time import now_local + + +def extract_duration_fr(text): + """ + Convert an french phrase into a number of seconds + Convert things like: + "10 minutes" + "3 jours 8 heures 10 minutes und 49 secondes" + into an int, representing the total number of seconds. + The words used in the duration will be consumed, and + the remainder returned. + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + Args: + text (str): string containing a duration + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. 
+ """ + if not text: + return None + + text = normalize_fr(text) + + time_units = { + 'microseconds': 'microsecondes', + 'milliseconds': 'millisecondes', + 'seconds': 'secondes', + 'minutes': 'minutes', + 'hours': 'heures', + 'days': 'jours', + 'weeks': 'semaines' + } + + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}[s]?(\s+|,|$)" + + for (unit_en, unit_fr) in time_units.items(): + unit_pattern = pattern.format(unit=unit_fr[:-1]) # remove 's' from unit + time_units[unit_en] = 0 + + def repl(match): + time_units[unit_en] += float(match.group(1)) + return '' + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + +def _number_parse_fr(words, i): + """ Parses a list of words to find a number + Takes in a list of words (strings without whitespace) and + extracts a number that starts at the given index. + Args: + words (array): the list to extract a number from + i (int): the index in words where to look for the number + Returns: + tuple with number, index of next word after the number. + + Returns None if no number was found. + """ + + def cte_fr(i, s): + # Check if string s is equal to words[i]. + # If it is return tuple with s, index of next word. + # If it is not return None. + if i < len(words) and s == words[i]: + return s, i + 1 + return None + + def number_word_fr(i, mi, ma): + # Check if words[i] is a number in _NUMBERS_FR between mi and ma. + # If it is return tuple with number, index of next word. + # If it is not return None. 
+ if i < len(words): + val = _NUMBERS_FR.get(words[i]) + # Numbers [1-16,20,30,40,50,60,70,80,90,100,1000] + if val is not None: + if val >= mi and val <= ma: + return val, i + 1 + else: + return None + # The number may be hyphenated (numbers [17-999]) + splitWord = words[i].split('-') + if len(splitWord) > 1: + val1 = _NUMBERS_FR.get(splitWord[0]) + if val1: + i1 = 0 + val2 = 0 + val3 = 0 + if val1 < 10 and splitWord[1] == "cents": + val1 = val1 * 100 + i1 = 2 + + # For [81-99], e.g. "quatre-vingt-deux" + if len(splitWord) > i1 and splitWord[0] == "quatre" and \ + splitWord[1] == "vingt": + val1 = 80 + i1 += 2 + + # We still found a number + if i1 == 0: + i1 = 1 + + if len(splitWord) > i1: + # For [21,31,41,51,61,71] + if len(splitWord) > i1 + 1 and splitWord[i1] == "et": + val2 = _NUMBERS_FR.get(splitWord[i1 + 1]) + if val2 is not None: + i1 += 2 + # For [77-79],[97-99] e.g. "soixante-dix-sept" + elif splitWord[i1] == "dix" and \ + len(splitWord) > i1 + 1: + val2 = _NUMBERS_FR.get(splitWord[i1 + 1]) + if val2 is not None: + val2 += 10 + i1 += 2 + else: + val2 = _NUMBERS_FR.get(splitWord[i1]) + if val2 is not None: + i1 += 1 + if len(splitWord) > i1: + val3 = _NUMBERS_FR.get(splitWord[i1]) + if val3 is not None: + i1 += 1 + + if val2: + if val3: + val = val1 + val2 + val3 + else: + val = val1 + val2 + else: + return None + if i1 == len(splitWord) and val and ma >= val >= mi: + return val, i + 1 + + return None + + def number_1_99_fr(i): + # Check if words[i] is a number between 1 and 99. + # If it is return tuple with number, index of next word. + # If it is not return None. + + # Is it a number between 1 and 16? + result1 = number_word_fr(i, 1, 16) + if result1: + return result1 + + # Is it a number between 10 and 99? 
+ result1 = number_word_fr(i, 10, 99) + if result1: + val1, i1 = result1 + result2 = cte_fr(i1, "et") + # If the number is not hyphenated [21,31,41,51,61,71] + if result2: + i2 = result2[1] + result3 = number_word_fr(i2, 1, 11) + if result3: + val3, i3 = result3 + return val1 + val3, i3 + return result1 + + # It is not a number + return None + + def number_1_999_fr(i): + # Check if words[i] is a number between 1 and 999. + # If it is return tuple with number, index of next word. + # If it is not return None. + + # Is it 100 ? + result = number_word_fr(i, 100, 100) + + # Is it [200,300,400,500,600,700,800,900]? + if not result: + resultH1 = number_word_fr(i, 2, 9) + if resultH1: + valH1, iH1 = resultH1 + resultH2 = number_word_fr(iH1, 100, 100) + if resultH2: + iH2 = resultH2[1] + result = valH1 * 100, iH2 + + if result: + val1, i1 = result + result2 = number_1_99_fr(i1) + if result2: + val2, i2 = result2 + return val1 + val2, i2 + else: + return result + + # Is it hyphenated? [101-999] + result = number_word_fr(i, 101, 999) + if result: + return result + + # [1-99] + result = number_1_99_fr(i) + if result: + return result + + return None + + def number_1_999999_fr(i): + """ Find a number in a list of words + Checks if words[i] is a number between 1 and 999,999. + + Args: + i (int): the index in words where to look for the number + Returns: + tuple with number, index of next word after the number. + + Returns None if no number was found. 
+ """ + + # check for zero + result1 = number_word_fr(i, 0, 0) + if result1: + return result1 + + # check for [1-999] + result1 = number_1_999_fr(i) + if result1: + val1, i1 = result1 + else: + val1 = 1 + i1 = i + # check for 1000 + result2 = number_word_fr(i1, 1000, 1000) + if result2: + # it's [1000-999000] + i2 = result2[1] + # check again for [1-999] + result3 = number_1_999_fr(i2) + if result3: + val3, i3 = result3 + return val1 * 1000 + val3, i3 + else: + return val1 * 1000, i2 + elif result1: + return result1 + return None + + return number_1_999999_fr(i) + + +def _get_ordinal_fr(word): + """ Get the ordinal number + Takes in a word (string without whitespace) and + extracts the ordinal number. + Args: + word (string): the word to extract the number from + Returns: + number (int) + + Returns None if no ordinal number was found. + """ + if word: + for ordinal in _ORDINAL_ENDINGS_FR: + if word[0].isdigit() and ordinal in word: + result = word.replace(ordinal, "") + if result.isdigit(): + return int(result) + + return None + + +def _number_ordinal_fr(words, i): + """ Find an ordinal number in a list of words + Takes in a list of words (strings without whitespace) and + extracts an ordinal number that starts at the given index. + Args: + words (array): the list to extract a number from + i (int): the index in words where to look for the ordinal number + Returns: + tuple with ordinal number (str), + index of next word after the number (int). + + Returns None if no ordinal number was found. 
+ """ + val1 = None + strOrd = "" + # it's already a digit, normalize to "1er" or "5e" + val1 = _get_ordinal_fr(words[i]) + if val1 is not None: + if val1 == 1: + strOrd = "1er" + else: + strOrd = str(val1) + "e" + return strOrd, i + 1 + + # if it's a big number the beginning should be detected as a number + result = _number_parse_fr(words, i) + if result: + val1, i = result + else: + val1 = 0 + + if i < len(words): + word = words[i] + if word in ["premier", "première"]: + strOrd = "1er" + elif word == "second": + strOrd = "2e" + elif word.endswith("ième"): + val2 = None + word = word[:-4] + # centième + if word == "cent": + if val1: + strOrd = str(val1 * 100) + "e" + else: + strOrd = "100e" + # millième + elif word == "mill": + if val1: + strOrd = str(val1 * 1000) + "e" + else: + strOrd = "1000e" + else: + # "cinquième", "trente-cinquième" + if word.endswith("cinqu"): + word = word[:-1] + # "neuvième", "dix-neuvième" + elif word.endswith("neuv"): + word = word[:-1] + "f" + result = _number_parse_fr([word], 0) + if not result: + # "trentième", "douzième" + word = word + "e" + result = _number_parse_fr([word], 0) + if result: + val2, i = result + if val2 is not None: + strOrd = str(val1 + val2) + "e" + if strOrd: + return strOrd, i + 1 + + return None + + +def extract_number_fr(text, short_scale=True, ordinals=False): + """Takes in a string and extracts a number. + Args: + text (str): the string to extract a number from + Returns: + (str): The number extracted or the original text. + """ + # TODO: short_scale and ordinals don't do anything here. + # The parameters are present in the function signature for API compatibility + # reasons. 
+ # normalize text, keep articles for ordinals versus fractionals + text = normalize_fr(text, False) + # split words by whitespace + aWords = text.split() + count = 0 + result = None + add = False + while count < len(aWords): + val = None + word = aWords[count] + wordNext = "" + wordPrev = "" + if count < (len(aWords) - 1): + wordNext = aWords[count + 1] + if count > 0: + wordPrev = aWords[count - 1] + + if word in _ARTICLES_FR: + count += 1 + continue + if word in ["et", "plus", "+"]: + count += 1 + add = True + continue + + # is current word a numeric number? + if word.isdigit(): + val = int(word) + count += 1 + elif is_numeric(word): + val = float(word) + count += 1 + elif wordPrev in _ARTICLES_FR and _get_ordinal_fr(word): + val = _get_ordinal_fr(word) + count += 1 + # is current word the denominator of a fraction? + elif is_fractional_fr(word): + val = is_fractional_fr(word) + count += 1 + + # is current word the numerator of a fraction? + if val and wordNext: + valNext = is_fractional_fr(wordNext) + if valNext: + val = float(val) * valNext + count += 1 + + if not val: + count += 1 + # is current word a numeric fraction like "2/3"? + aPieces = word.split('/') + # if (len(aPieces) == 2 and is_numeric(aPieces[0]) + # and is_numeric(aPieces[1])): + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + + # is current word followed by a decimal value? + if wordNext == "virgule": + zeros = 0 + newWords = aWords[count + 1:] + # count the number of zeros after the decimal sign + for word in newWords: + if word == "zéro" or word == "0": + zeros += 1 + else: + break + afterDotVal = None + # extract the number after the zeros + if newWords[zeros].isdigit(): + afterDotVal = newWords[zeros] + countDot = count + zeros + 2 + # if a number was extracted (since comma is also a + # punctuation sign) + if afterDotVal: + count = countDot + if not val: + val = 0 + # add the zeros + afterDotString = zeros * "0" + afterDotVal + val = float(str(val) + "." 
+ afterDotString) + if val: + if add: + result += val + add = False + else: + result = val + + return result or False + + +def extract_datetime_fr(text, anchorDate=None, default_time=None): + def clean_string(s): + """ + cleans the input string of unneeded punctuation and capitalization + among other things. + """ + s = normalize_fr(s, True) + wordList = s.split() + for idx, word in enumerate(wordList): + # remove comma and dot if it's not a number + if word[-1] in [",", "."]: + word = word[:-1] + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or dayOffset or + (isTime and (hrAbs or minAbs)) or + hrOffset != 0 or minOffset != 0 or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersList = ["matin", "après-midi", "soir", "nuit"] + words_in = ["dans", "après"] + markers = ["à", "dès", "autour", "vers", "environs", "ce", + "cette"] + words_in + days = ["lundi", "mardi", "mercredi", + "jeudi", "vendredi", "samedi", "dimanche"] + months = ["janvier", "février", "mars", "avril", "mai", "juin", + "juillet", "août", "septembre", "octobre", "novembre", + "décembre"] + monthsShort = ["jan", "fév", "mar", "avr", "mai", "juin", "juil", "aoû", + "sept", "oct", "nov", "déc"] + # needed for format functions + months_en = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrevPrev = words[idx - 3] if idx > 2 else "" + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx 
- 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + used = 1 + if wordPrev in ["ce", "cet", "cette"]: + used = 2 + start -= 1 + # parse aujourd'hui, demain, après-demain + elif word == "aujourd'hui" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "demain" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "après-demain" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine + elif word in ["jour", "jours"]: + if wordPrev.isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + # "3e jour" + elif _get_ordinal_fr(wordPrev) is not None: + dayOffset += _get_ordinal_fr(wordPrev) - 1 + start -= 1 + used = 2 + elif word in ["semaine", "semaines"] and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordNext in ["prochaine", "suivante"]: + dayOffset = 7 + used = 2 + elif wordNext in ["dernière", "précédente"]: + dayOffset = -7 + used = 2 + # parse 10 mois, mois prochain, mois dernier + elif word == "mois" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordNext in ["prochain", "suivant"]: + monthOffset = 1 + used = 2 + elif wordNext in ["dernier", "précédent"]: + monthOffset = -1 + used = 2 + # parse 5 ans, an prochain, année dernière + elif word in ["an", "ans", "année", "années"] and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordNext in ["prochain", "prochaine", "suivant", "suivante"]: + yearOffset = 1 + used = 2 + elif wordNext in ["dernier", "dernière", "précédent", + "précédente"]: + yearOffset = -1 + used = 2 + # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordNext in ["prochain", "suivant"]: + dayOffset += 7 + used += 1 + elif wordNext in ["dernier", "précédent"]: + dayOffset -= 7 + used += 1 + # parse 15 juillet, 15 juil + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months_en[m] + if wordPrev and (wordPrev[0].isdigit()): + datestr += " " + wordPrev + start -= 1 + used += 1 + else: + datestr += " 1" + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + # parse 5 jours après demain, 10 semaines après jeudi prochain, + # 2 mois après juillet + validFollowups = days + months + monthsShort + validFollowups.append("aujourd'hui") + validFollowups.append("demain") + validFollowups.append("prochain") + validFollowups.append("prochaine") + validFollowups.append("suivant") + validFollowups.append("suivante") + validFollowups.append("dernier") + validFollowups.append("dernière") + validFollowups.append("précédent") + validFollowups.append("précédente") + validFollowups.append("maintenant") + if word in ["après", "depuis"] and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "demain": + dayOffset += 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if wordNextNext == "prochain": + tmpOffset += 7 + used += 1 + elif wordNextNext == "dernier": + tmpOffset -= 7 + used += 1 + elif tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] in ["ce", "cette"]: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + 
hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + ampm = "" + isTime = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + used = 0 + start = idx + + # parse midi et quart, minuit et demi, midi 10, minuit moins 20 + if word in ["midi", "minuit"]: + isTime = True + if word == "midi": + hrAbs = 12 + used += 1 + elif word == "minuit": + hrAbs = 0 + used += 1 + if wordNext.isdigit(): + minAbs = int(wordNext) + used += 1 + elif wordNext == "et": + if wordNextNext == "quart": + minAbs = 15 + used += 2 + elif wordNextNext == "demi": + minAbs = 30 + used += 2 + elif wordNext == "moins": + if wordNextNext.isdigit(): + minAbs = 60 - int(wordNextNext) + if not hrAbs: + hrAbs = 23 + else: + hrAbs -= 1 + used += 2 + if wordNextNext == "quart": + minAbs = 45 + if not hrAbs: + hrAbs = 23 + else: + hrAbs -= 1 + used += 2 + # parse une demi-heure, un quart d'heure + elif word == "demi-heure" or word == "heure" and \ + (wordPrevPrev in markers or wordPrevPrevPrev in markers): + used = 1 + isTime = True + if word == "demi-heure": + minOffset = 30 + elif wordPrev == "quart": + minOffset = 15 + used += 1 + start -= 1 + elif wordPrev == "quarts" and wordPrevPrev.isdigit(): + minOffset = int(wordPrevPrev) * 15 + used += 1 + start -= 1 + if wordPrev.isdigit() or wordPrevPrev.isdigit(): + start -= 1 + used += 1 + # parse 5:00 du matin, 12:00, etc + elif word[0].isdigit() and _get_ordinal_fr(word) is None: + isTime = True + if ":" in word or "h" in word or "min" in word: + # parse hours on short format + # "3:00 du matin", "4h14", "3h15min" + strHH = "" + strMM = "" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + used = 1 + elif word[i] in [":", 
"h", "m"]: + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + used = 1 + else: + stage = 2 + if word[i:i + 3] == "min": + i += 1 + elif stage == 2: + break + if wordPrev in words_in: + hrOffset = int(strHH) if strHH else 0 + minOffset = int(strMM) if strMM else 0 + else: + hrAbs = int(strHH) if strHH else 0 + minAbs = int(strMM) if strMM else 0 + else: + # try to parse time without colons + # 5 hours, 10 minutes etc. + length = len(word) + ampm = "" + if ( + word.isdigit() and + wordNext in ["heures", "heure"] and word != "0" and + ( + int(word) < 100 or + int(word) > 2400 + )): + # "dans 3 heures", "à 3 heures" + if wordPrev in words_in: + hrOffset = int(word) + else: + hrAbs = int(word) + used = 2 + idxHr = idx + 2 + # "dans 1 heure 40", "à 1 heure 40" + if idxHr < len(words): + # "3 heures 45" + if words[idxHr].isdigit(): + if wordPrev in words_in: + minOffset = int(words[idxHr]) + else: + minAbs = int(words[idxHr]) + used += 1 + idxHr += 1 + # "3 heures et quart", "4 heures et demi" + elif words[idxHr] == "et" and idxHr + 1 < len(words): + if words[idxHr + 1] == "quart": + if wordPrev in words_in: + minOffset = 15 + else: + minAbs = 15 + used += 2 + idxHr += 2 + elif words[idxHr + 1] == "demi": + if wordPrev in words_in: + minOffset = 30 + else: + minAbs = 30 + used += 2 + idxHr += 2 + # "5 heures moins 20", "6 heures moins le quart" + elif words[idxHr] == "moins" and \ + idxHr + 1 < len(words): + if words[idxHr + 1].isdigit(): + if wordPrev in words_in: + hrOffset -= 1 + minOffset = 60 - int(words[idxHr + 1]) + else: + hrAbs = hrAbs - 1 + minAbs = 60 - int(words[idxHr + 1]) + used += 2 + idxHr += 2 + elif words[idxHr + 1] == "quart": + if wordPrev in words_in: + hrOffset -= 1 + minOffset = 45 + else: + hrAbs = hrAbs - 1 + minAbs = 45 + used += 2 + idxHr += 2 + # remove word minutes if present + if idxHr < len(words) and \ + words[idxHr] in ["minutes", "minute"]: + used += 1 + idxHr += 1 + elif wordNext == 
"minutes": + # "dans 10 minutes" + if wordPrev in words_in: + minOffset = int(word) + else: + minAbs = int(word) + used = 2 + elif wordNext == "secondes": + # "dans 5 secondes" + secOffset = int(word) + used = 2 + elif int(word) > 100: + # format militaire + hrAbs = int(word) / 100 + minAbs = int(word) - hrAbs * 100 + used = 1 + if wordNext == "heures": + used += 1 + + # handle am/pm + if timeQualifier: + if timeQualifier == "matin": + ampm = "am" + elif timeQualifier == "après-midi": + ampm = "pm" + elif timeQualifier == "soir": + ampm = "pm" + elif timeQualifier == "nuit": + if (hrAbs or 0) > 8: + ampm = "pm" + else: + ampm = "am" + hrAbs = ((hrAbs or 0) + 12 if ampm == "pm" and (hrAbs or 0) < 12 + else hrAbs) + hrAbs = ((hrAbs or 0) - 12 if ampm == "am" and (hrAbs or 0) >= 12 + else hrAbs) + if (hrAbs or 0) > 24 or ((minAbs or 0) > 59): + isTime = False + used = 0 + elif wordPrev in words_in: + isTime = False + else: + isTime = True + + elif not hrAbs and timeQualifier: + if timeQualifier == "matin": + hrAbs = 8 + elif timeQualifier == "après-midi": + hrAbs = 15 + elif timeQualifier == "soir": + hrAbs = 19 + elif timeQualifier == "nuit": + hrAbs = 2 + isTime = True + + if used > 0: + # removed parsed words from the sentence + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + if not hasYear: + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=gettz("UTC")) + temp = temp.astimezone(extractedDate.tzinfo) + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = 
def is_fractional_fr(input_str, short_scale=True):
    """
    Tell whether a French word names a fraction and, if so, which one.

    A trailing plural "s" is dropped first (except for "tiers", which
    already ends in "s"), so "quarts" is treated like "quart".

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted for signature parity with the other
            languages; this function never reads it
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    token = input_str.lower()

    # strip the plural mark, e.g. "quarts" -> "quart"
    if token != "tiers" and token[-1:] == "s":
        token = token[:-1]

    # the position in this tuple, plus one, is the denominator
    denominators = (
        "entier", "demi", "tiers", "quart", "cinquième", "sixième",
        "septième", "huitième", "neuvième", "dixième", "onzième",
        "douzième", "treizième", "quatorzième", "quinzième", "seizième",
        "dix-septième", "dix-huitième", "dix-neuvième", "vingtième")

    if token in denominators:
        return 1.0 / (denominators.index(token) + 1)

    # any other ordinal understood by _get_ordinal_fr is a denominator too
    if _get_ordinal_fr(token):
        return 1.0 / _get_ordinal_fr(token)

    # round ordinals checked last, exactly as before
    round_ordinals = {"trentième": 30, "centième": 100, "millième": 1000}
    if token in round_ordinals:
        return 1.0 / round_ordinals[token]

    return False


def normalize_fr(text, remove_articles=True):
    """
    French string normalization.

    Lower-cases the text, optionally drops articles (standalone ones and
    the elided "l'" / "d'" prefixes), drops isolated punctuation tokens and
    replaces number words by the values returned by the number helpers.

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): drop French articles when True
    Returns:
        (str): the normalized sentence, tokens separated by single spaces
    """
    tokens = text.lower().split()  # split() also collapses extra spaces
    pieces = []
    pos = 0
    while pos < len(tokens):
        if remove_articles:
            # standalone article: skip the token altogether
            if tokens[pos] in _ARTICLES_FR:
                pos += 1
                continue
            # elided article: keep the word, drop the "l'" / "d'" prefix
            if tokens[pos][:2] in ("l'", "d'"):
                tokens[pos] = tokens[pos][2:]

        # remove useless punctuation signs
        if tokens[pos] in ("?", "!", ";", "…"):
            pos += 1
            continue

        # a word right after an article is first tried as an ordinal;
        # the helper yields None or a (value, next position) pair
        if pos > 0 and tokens[pos - 1] in _ARTICLES_FR:
            parsed = _number_ordinal_fr(tokens, pos)
            if parsed is not None:
                value, pos = parsed
                pieces.append(str(value))
                continue

        # convert number words into digits
        parsed = _number_parse_fr(tokens, pos)
        if parsed is not None:
            value, pos = parsed
            pieces.append(str(value))
            continue

        pieces.append(tokens[pos])
        pos += 1

    return " ".join(pieces)
def extract_numbers_fr(text, short_scale=True, ordinals=False):
    """
    Takes in a string and extracts a list of numbers.

    Thin French front-end: the work is delegated to
    extract_numbers_generic, which is handed the French pronounce/extract
    functions together with the two flags below.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    options = dict(short_scale=short_scale, ordinals=ordinals)
    return extract_numbers_generic(text, pronounce_number_fr,
                                   extract_number_fr, **options)


class FrenchNormalizer(Normalizer):
    """ TODO implement language specific normalizer"""
class HungarianNormalizer(Normalizer):
    """ TODO implement language specific normalizer"""


def normalize_hu(text, remove_articles=True):
    """
    Hungarian string normalization.

    Builds a HungarianNormalizer and returns whatever its normalize()
    method produces for the given arguments.

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): forwarded unchanged to normalize()
    Returns:
        the value returned by HungarianNormalizer.normalize
    """
    normalizer = HungarianNormalizer()
    return normalizer.normalize(text, remove_articles)
def is_fractional_it(input_str, short_scale=False):
    """
    This function takes the given text and checks if it is a fraction.
    Updated to italian from en version 18.8.9

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    input_str = input_str.lower()
    if input_str.endswith('i', -1) and len(input_str) > 2:
        input_str = input_str[:-1] + "o"  # normalize plurals

    fracts_it = {"intero": 1, "mezza": 2, "mezzo": 2}

    # ordinals above 2 double as fraction names; the ordinal's number is
    # used as the denominator
    ordinal_names = (_SHORT_ORDINAL_STRING_IT if short_scale
                     else _LONG_ORDINAL_STRING_IT)
    for num in ordinal_names:
        if num > 2:
            fracts_it[ordinal_names[num]] = num

    if input_str in fracts_it:
        return 1.0 / fracts_it[input_str]
    return False


def _extract_number_long_it(word):
    """
    Convert one Italian number word written without spaces into an integer.

    The word is rewritten step by step into a small arithmetic expression
    made of decimal literals joined by '+' and '*' (for instance
    'milleventisette' becomes '1000+20+7'), which is then evaluated.
    Scale names from 'milioni' (1e6) up to 'triliardi' (1e21) are
    recognised.

    Examples:
        milleventisette -> 1027
        diecimilaquarantuno -> 10041
        centottomiladuecentotredici -> 108213

    Words that are not numbers but happen to contain a number fragment
    ('milano', 'accento') yield False instead of raising ValueError.

    Args:
        word (str): the word to convert in number
    Returns:
        (bool) or (int): The extracted number or False if no number
                         was found
    """

    units = {'zero': 0, 'uno': 1, 'due': 2, 'tre': 3, 'quattro': 4,
             'cinque': 5, 'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9}

    tens = {'dieci': 10, 'venti': 20, 'trenta': 30, 'quaranta': 40,
            'cinquanta': 50, 'sessanta': 60, 'settanta': 70, 'ottanta': 80,
            'novanta': 90}

    # tens with the final vowel elided, as in 'ventuno' or 'trentotto'
    tens_short = {'vent': 20, 'trent': 30, 'quarant': 40, 'cinquant': 50,
                  'sessant': 60, 'settant': 70, 'ottant': 80, 'novant': 90}

    nums_long = {'undici': 11, 'dodici': 12, 'tredici': 13, 'quattordici': 14,
                 'quindici': 15, 'sedici': 16, 'diciassette': 17,
                 'diciotto': 18, 'diciannove': 19}

    # Larger scales (up to 1e63, 'deciliardi') are deliberately not enabled.
    multipli_it = collections.OrderedDict([
        (1e21, 'triliardi'),  # zetta
        (1e18, 'trilioni'),  # exa
        (1e15, 'biliardi'),  # peta
        (1e12, 'bilioni'),  # tera
        (1e9, 'miliardi'),  # giga
        (1e6, 'milioni')  # mega
    ])

    multiplier = {}
    un_multiplier = {}

    for num in multipli_it:
        if num > 1000 and num <= 1e21:
            # plural forms
            multiplier[multipli_it[num]] = int(num)
            # singular forms: '-liardi' names end in 'o', the others in 'e'
            if multipli_it[num][-5:-1] == 'iard':
                un_multiplier['un' + multipli_it[num][:-1] + 'o'] = int(num)
            else:
                un_multiplier['un' + multipli_it[num][:-1] + 'e'] = int(num)

    # reduce singular or plural ordinals (-esimo, -esimi) to the cardinal
    if word[-5:-1] == 'esim':
        base = word[:-5]
        normalize_ita3 = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'}
        normalize_ita2 = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e',
                          'ov': 'e'}

        if base[-3:] in normalize_ita3:
            base += normalize_ita3[base[-3:]]
        elif base[-2:] in normalize_ita2:
            base += normalize_ita2[base[-2:]]

        word = base

    for item in un_multiplier:
        components = word.split(item, 1)
        if len(components) == 2:
            if not components[0]:  # word starts with the singular scale
                if not components[1]:  # e.g. 'unmilione'
                    word = str(int(un_multiplier[item]))
                else:  # e.g. 'unmilione' followed by more digits
                    word = str(int(un_multiplier[item]) +
                               _extract_number_long_it(components[1]))

    for item in multiplier:
        components = word.split(item, 1)
        if len(components) == 2:
            if not components[0]:  # word starts with the plural scale
                word = str(int(multiplier[item]) +
                           _extract_number_long_it(components[1]))
            else:
                if not components[1]:
                    word = str(_extract_number_long_it(components[0])) + '*' \
                        + str(int(multiplier[item]))
                else:
                    word = str(_extract_number_long_it(components[0])) + '*' \
                        + str(int(multiplier[item])) + '+' \
                        + str(_extract_number_long_it(components[1]))

    for item in tens:
        word = word.replace(item, '+' + str(tens[item]))

    for item in tens_short:
        word = word.replace(item, '+' + str(tens_short[item]))

    for item in nums_long:
        word = word.replace(item, '+' + str(nums_long[item]))

    # 'centotto' elides the 'o' of 'cento'; keep 'otto' intact so that the
    # plain 'cento' replacement below does not swallow its first letter
    word = word.replace('centott', '+1xxott')
    word = word.replace('cento', '+1xx')
    word = word.replace('cent', '+1xx')
    word = word.replace('mille', '+1000')  # unmilionemille
    word = word.replace('mila', '*1000')  # unmilioneduemila

    for item in units:
        word = word.replace(item, '+' + str(units[item]))

    # resolve every hundred marker, last one first: it is added after a
    # value ending in 0 (e.g. +1000+1xx) and multiplied otherwise (+2+1xx)
    for _ in range(word.count('+1xx')):
        head, tail = word.rsplit('+1xx', 1)
        if len(head) > 1 and head.endswith('0'):
            word = head + '+100' + tail
        else:
            word = head + '*100' + tail

    # everything before the last '*1000' is the number of thousands
    components = word.rsplit('*1000', 1)
    if len(components) == 2:
        if components[0].startswith('*'):  # centomila
            components[0] = components[0][1:]
        word = str(_extract_number_long_it(components[0])) + \
            '*1000' + str(components[1])

    # drop a leading operator left over by the replacements
    if word.startswith(('*', '+')):
        word = word[1:]

    # evaluate the sum of products; any token that is not a plain decimal
    # literal (leftover letters, or 'False' from a failed sub-parse) means
    # the word was not a number
    total = 0
    for addend in word.split('+'):
        factors = addend.split('*')
        if not all(factor.isdecimal() for factor in factors):
            return False
        product = 1
        for factor in factors:
            product *= int(factor)
        total += product
    return total
def extract_number_it(text, short_scale=False, ordinals=False):
    """
    This function extracts a number from a text string,
    handles pronunciations in long scale and short scale

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    The word->number table used for the lookup is a private copy of
    _STRING_NUM_IT extended with the scale names (and, when requested, the
    ordinal names); the module-level table itself is never modified, so the
    result of a call does not depend on the flags used by earlier calls.

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    def leading_zeros(fragment):
        # number of 'zero' / '0' tokens opening the fragment
        count = 0
        for element in fragment.split(' '):
            if element == 'zero' or element == '0':
                count += 1
            else:
                break
        return count

    text = text.lower()

    # private lookup table: see the docstring
    string_num = dict(_STRING_NUM_IT)

    # first, second...
    if ordinals:
        ordinal_names = (_SHORT_ORDINAL_STRING_IT if short_scale
                         else _LONG_ORDINAL_STRING_IT)
        for num in ordinal_names:
            string_num[ordinal_names[num]] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not commonly used in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    scale_names = _SHORT_SCALE_IT if short_scale else _LONG_SCALE_IT
    for num in scale_names:
        num_string = scale_names[num]
        string_num[num_string] = num
        multiplies.append(num_string)

    # "2 e 3/4" and similar cases
    for separator in fraction_marker:
        components = text.split(separator)

        if len(components) == 2:
            zeros = leading_zeros(components[1])
            # the flags are passed on so both halves are read with the
            # same tables as the whole sentence
            num1 = extract_number_it(components[0], short_scale, ordinals)
            num2 = extract_number_it(components[1], short_scale, ordinals)
            # a failed parse is False, which fails both range tests below
            if num1 >= 1:
                # first is not a fraction and second is a fraction
                if 0 < num2 < 1:
                    return num1 + num2
                # "sette e quaranta", "sette e zero zero due"
                if num2 > 1:
                    # count integer digits only: str() of a float such as
                    # 40.0 would also count the trailing ".0"
                    digits = len(str(int(num2)))
                    return num1 + num2 / pow(10, digits + zeros)

    # "2 punto 5"
    for separator in decimal_marker:
        components = text.split(separator)

        if len(components) == 2:
            zeros = leading_zeros(components[1])
            number = extract_number_it(components[0], short_scale, ordinals)
            decimal = extract_number_it(components[1], short_scale, ordinals)
            # test for a failed parse (False) BEFORE converting to int:
            # int(False) is 0 and would be mistaken for a real number
            if number is not False and decimal is not False:
                number = int(number)
                decimal = int(decimal)
                return number + decimal / pow(10, len(str(decimal)) + zeros)

    all_words = text.split()
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):

        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in string_num:
            val = string_num[word]

        # "tre quarti", "un quarto", "trenta secondi"
        if is_fractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # "mezza tazza"
        if val is False:
            val = is_fractional_it(word, short_scale=short_scale)

        # "2 quinti"
        if not ordinals:
            next_value = is_fractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = _extract_number_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif _extract_number_long_it(word) > 100 and \
                    _extract_number_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    # add the partial results set aside above; with nothing set aside val
    # is returned untouched, so "no number" stays False rather than 0
    for addend in to_sum:
        val = val + addend
    return val


def normalize_it(text, remove_articles=True):
    """
    IT string normalization.

    Replaces 'un paio' by 'due', optionally drops the articles listed in
    _ARTICLES_IT and rewrites number words as digits
    (e.g. 'quarantadue' -> '42').

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): drop Italian articles when True
    Returns:
        (str): the normalized sentence, tokens separated by single spaces
    """
    # replace ambiguous words
    text = text.replace('un paio', 'due')

    # Contractions are not common in IT
    pieces = []
    for word in text.split():  # split() also removes extra spaces
        # Italian requires the article to define the grammatical gender,
        # so articles are only dropped on request
        if remove_articles and word in _ARTICLES_IT:
            continue

        if word in _STRING_NUM_IT:
            word = str(_STRING_NUM_IT[word])

        # int() truncates, so a pure fraction (or no number at all) gives 0
        # and the word is kept as it is
        val = int(extract_number_it(word))  # was extractnumber_long_it
        if val:
            word = str(val)

        pieces.append(word)

    return ' '.join(pieces)
+ Normalize italian plurals + """ + symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\''] + + for word in symbols: + s = s.replace(word, '') + + s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'") \ + .replace('é', "e'").replace('ì', 'i').replace('ù', 'u') \ + .replace('ò', 'o').replace('-', ' ').replace('_', '') + + # normalizza plurali per semplificare analisi + s = s.replace('secondi', 'secondo').replace('minuti', 'minuto') \ + .replace('ore', 'ora').replace('giorni', 'giorno') \ + .replace('settimane', 'settimana').replace('mesi', 'mese') \ + .replace('anni', 'anno').replace('mattino', 'mattina') \ + .replace('prossima', 'prossimo').replace('questa', 'questo') \ + .replace('quarti', 'quarto').replace('in punto', 'in_punto') \ + .replace('decennio', 'decenni').replace('secoli', 'secolo') \ + .replace('millennio', 'millenni').replace(' un ', ' uno ') \ + .replace('scorsa', 'scorso').replace('passata', 'passato') \ + .replace('uno paio', 'due') + + noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo', + 'le', 'alle', 'alla', 'dai', 'delle', 'della', + 'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel', + 'nello', 'dallo', ' '] + + word_list = s.split() + word_list = [x for x in word_list if x not in noise_words] + # normalizza alcuni formati orari + for idx in range(0, len(word_list) - 1): + if word_list[idx][0].isdigit() and word_list[idx + 1][0].isdigit(): + num0 = int(word_list[idx]) + num1 = int(word_list[idx + 1]) + if 0 <= num0 <= 23 and 10 <= num1 <= 59: + word_list[idx] = str(num0) + ':' + str(num1) + word_list[idx + 1] = '' + + word_list = [x for x in word_list if x] + + return word_list + + def date_found(): + return found or \ + (datestr != '' or time_str != '' or year_offset != 0 or + month_offset != 0 or day_offset is True or hr_offset != 0 or + hr_abs or min_offset != 0 or min_abs or sec_offset != 0) + + if text == '': + return None + anchorDate = anchorDate or now_local() + found = False + day_specified = False + 
day_offset = False + month_offset = 0 + year_offset = 0 + today = anchorDate.strftime('%w') + current_year = anchorDate.strftime('%Y') + from_flag = False + datestr = '' + has_year = False + time_qualifier = '' + time_qualifiers_am = ['mattina', 'stamani', 'stamane'] + time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte'] + time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) + markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro'] + days = ['lunedi', 'martedi', 'mercoledi', + 'giovedi', 'venerdi', 'sabato', 'domenica'] + months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno', + 'luglio', 'agosto', 'settembre', 'ottobre', 'novembre', + 'dicembre'] + months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago', + 'set', 'ott', 'nov', 'dic'] + year_multiples = ['decenni', 'secolo', 'millenni'] # decennio <- decenni + time_multiples = ['ora', 'minuto', 'secondo'] + day_multiples = ['settimana', 'mese', 'anno'] + noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del', + 'l', 'in_punto', ' ', 'nella', 'dell'] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == '': + continue + word_prev_prev = words[idx - 2] if idx > 1 else '' + word_prev = words[idx - 1] if idx > 0 else '' + word_next = words[idx + 1] if idx + 1 < len(words) else '' + word_next_next = words[idx + 2] if idx + 2 < len(words) else '' + start = idx + used = 0 + # save timequalifier for later + if word == 'adesso' and not datestr: + # word == 'ora' va in conflitto con 'tra un ora' + words = [x for x in words if x != 'adesso'] + words = [x for x in words if x] + result_str = ' '.join(words) + extracted_date = anchorDate.replace(microsecond=0) + return [extracted_date, result_str] + + # un paio di o tra tre settimane --> secoli + elif extract_number_it(word) and (word_next in year_multiples or + word_next in day_multiples): + multiplier = int(extract_number_it(word)) + used += 2 + if word_next 
== 'decenni': + year_offset = multiplier * 10 + elif word_next == 'secolo': + year_offset = multiplier * 100 + elif word_next == 'millenni': + year_offset = multiplier * 1000 + elif word_next == 'anno': + year_offset = multiplier + elif word_next == 'mese': + month_offset = multiplier + elif word_next == 'settimana': + day_offset = multiplier * 7 + elif word in time_qualifiers_list: + time_qualifier = word + # parse today, tomorrow, day after tomorrow + elif word == 'oggi' and not from_flag: + day_offset = 0 + used += 1 + elif word == 'domani' and not from_flag: + day_offset = 1 + used += 1 + elif word == 'ieri' and not from_flag: + day_offset -= 1 + used += 1 + elif word == 'dopodomani' and not from_flag: # after tomorrow + day_offset += 2 + used += 1 + elif word == 'dopo' and word_next == 'domani' and not from_flag: + day_offset += 1 + used += 2 + elif word == 'giorno': + if word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + if word_next == 'dopo' and word_next_next == 'domani': + day_offset += 1 + used += 2 + elif word == 'settimana' and not from_flag: + if word_prev == 'prossimo': + day_offset = 7 + start -= 1 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + day_offset = -7 + start -= 1 + used = 2 + elif word_next == 'prossimo': + day_offset = 7 + used += 2 + elif word_next == 'passato' or word_next == 'scorso': + day_offset = -7 + used += 2 + # parse next month, last month + elif word == 'mese' and not from_flag: + if word_prev == 'prossimo': + month_offset = 1 + start -= 1 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + month_offset = -1 + start -= 1 + used = 2 + elif word_next == 'prossimo': + month_offset = 1 + used += 2 + elif word_next == 'passato' or word_next == 'scorso': + month_offset = -1 + used += 2 + # parse next year, last year + elif word == 'anno' and not from_flag: + if word_prev == 'prossimo': # prossimo anno + year_offset = 1 + start -= 1 + used = 2 + elif word_next == 
'prossimo': # anno prossimo + year_offset = 1 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + year_offset = -1 + start -= 1 + used = 2 + elif word_next == 'passato' or word_next == 'scorso': + year_offset = -1 + used = 2 + elif word == 'decenni' and not from_flag: + if word_prev == 'prossimo': # prossimo mese + year_offset = 10 + start -= 1 + used = 2 + elif word_next == 'prossimo': # mese prossimo + year_offset = 10 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + year_offset = -10 + start -= 1 + used = 2 + elif word_next == 'passato' or word_next == 'scorso': + year_offset = -10 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not from_flag: + ddd = days.index(word) + day_offset = (ddd + 1) - int(today) + used = 1 + if day_offset < 0: + day_offset += 7 + if word_prev == 'prossimo': + day_offset += 7 + start -= 1 + used += 1 + elif word_prev == 'passato' or word_prev == 'scorso': + day_offset -= 7 + start -= 1 + used += 1 + if word_next == 'prossimo': + day_offset += 7 + used += 1 + elif word_next == 'passato' or word_next == 'scorso': + day_offset -= 7 + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in months_short and not from_flag: + try: + mmm = months.index(word) + except ValueError: + mmm = months_short.index(word) + used += 1 + datestr = months[mmm] + if word_prev and extract_number_it(word_prev): + datestr += ' ' + str(int(extract_number_it(word_prev))) + start -= 1 + used += 1 + if word_next and extract_number_it(word_next): + datestr += ' ' + str(int(extract_number_it(word_next))) + used += 1 + has_year = True + else: + has_year = False + elif word_next and word_next[0].isdigit(): + datestr += ' ' + word_next + used += 1 + if word_next_next and word_next_next[0].isdigit(): + datestr += ' ' + word_next_next + used += 1 + has_year = True + else: + has_year = False + # parse 5 days from tomorrow, 10 weeks from 
next thursday, + # 2 months from July + validFollowups = days + months + months_short + validFollowups.append('oggi') + validFollowups.append('domani') + validFollowups.append('prossimo') + validFollowups.append('passato') + validFollowups.append('adesso') + + if (word == 'da' or word == 'dopo') and word_next in validFollowups: + used = 0 + from_flag = True + if word_next == 'domani': + day_offset += 1 + used += 2 + elif word_next == 'oggi' or word_next == 'adesso': + used += 2 + elif word_next in days: + ddd = days.index(word_next) + tmp_offset = (ddd + 1) - int(today) + used += 2 + if tmp_offset < 0: + tmp_offset += 7 + if word_next_next == 'prossimo': + tmp_offset += 7 + used += 1 + elif word_next_next == 'passato' or word_next_next == 'scorso': + tmp_offset = (ddd + 1) - int(today) + used += 1 + day_offset += tmp_offset + elif word_next_next and word_next_next in days: + ddd = days.index(word_next_next) + tmp_offset = (ddd + 1) - int(today) + if word_next == 'prossimo': + tmp_offset += 7 + # elif word_next == 'passato' or word_next == 'scorso': + # tmp_offset -= 7 + day_offset += tmp_offset + used += 3 + + if used > 0: + if start - 1 > 0 and words[start - 1] == 'questo': + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = '' + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = '' + found = True + day_specified = True + + # parse time + time_str = '' + hr_offset = 0 + min_offset = 0 + sec_offset = 0 + hr_abs = None + min_abs = None + military = False + + for idx, word in enumerate(words): + if word == '': + continue + word_prev_prev = words[idx - 2] if idx > 1 else '' + word_prev = words[idx - 1] if idx > 0 else '' + word_next = words[idx + 1] if idx + 1 < len(words) else '' + word_next_next = words[idx + 2] if idx + 2 < len(words) else '' + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == 'mezzogiorno': + hr_abs = 12 + used += 1 + elif word == 'mezzanotte': + hr_abs = 24 + used += 1 + if 
word == 'mezzo' and word_next == 'giorno': + hr_abs = 12 + used += 2 + elif word == 'mezza' and word_next == 'notte': + hr_abs = 24 + used += 2 + elif word == 'mattina': + if not hr_abs: + hr_abs = 8 + used += 1 + if word_next and word_next[0].isdigit(): # mattina alle 5 + hr_abs = int(word_next) + used += 1 + elif word == 'pomeriggio': + if not hr_abs: + hr_abs = 15 + used += 1 + if word_next and word_next[0].isdigit(): # pomeriggio alle 5 + hr_abs = int(word_next) + used += 1 + if (hr_abs or 0) < 12: + hr_abs = (hr_abs or 0) + 12 + elif word == 'sera': + if not hr_abs: + hr_abs = 19 + used += 1 + if word_next and word_next[0].isdigit() \ + and ':' not in word_next: + hr_abs = int(word_next) + used += 1 + if (hr_abs or 0) < 12: + hr_abs = (hr_abs or 0) + 12 + # da verificare più a fondo + elif word == 'presto': + hr_abs -= 1 + used += 1 + elif word == 'tardi': + hr_abs += 1 + used += 1 + # un paio di minuti tra cinque minuti tra 5 ore + elif extract_number_it(word) and (word_next in time_multiples): + d_time = int(extract_number_it(word)) + used += 2 + if word_next == 'ora': + hr_offset = d_time + isTime = False + hr_abs = -1 + min_abs = -1 + elif word_next == 'minuto': + min_offset = d_time + isTime = False + hr_abs = -1 + min_abs = -1 + elif word_next == 'secondo': + sec_offset = d_time + isTime = False + hr_abs = -1 + min_abs = -1 + elif word == 'mezzora': + min_offset = 30 + used = 1 + isTime = False + hr_abs = -1 + min_abs = -1 + # if word_prev == 'uno' or word_prev == 'una': + # start -= 1 + # used += 1 + elif extract_number_it(word) and word_next and \ + word_next == 'quarto' and word_next_next == 'ora': + if int(extract_number_it(word)) == 1 \ + or int(extract_number_it(word)) == 3: + min_offset = 15 * int(extract_number_it(word)) + else: # elimina eventuali errori + min_offset = 15 + used = 3 + start -= 1 + isTime = False + hr_abs = -1 + min_abs = -1 + elif word[0].isdigit(): + isTime = True + str_hh = '' + str_mm = '' + remainder = '' + if ':' in word: + 
# parse colons + # '3:00 in the morning' + components = word.split(':') + if len(components) == 2: + num0 = int(extract_number_it(components[0])) + num1 = int(extract_number_it(components[1])) + if num0 is not False and num1 is not False \ + and 0 <= num0 <= 23 and 0 <= num1 <= 59: + str_hh = str(num0) + str_mm = str(num1) + elif 0 < int(extract_number_it(word)) < 24 \ + and word_next != 'quarto': + str_hh = str(int(word)) + str_mm = '00' + elif 100 <= int(word) <= 2400: + str_hh = int(word) / 100 + str_mm = int(word) - str_hh * 100 + military = True + isTime = False + if extract_number_it(word) and word_next \ + and word_next == 'quarto' and word_next_next != 'ora': + if int(extract_number_it(word)) == 1 \ + or int(extract_number_it(word)) == 3: + str_mm = str(15 * int(extract_number_it(word))) + else: # elimina eventuali errori + str_mm = '0' + str_hh = str(hr_abs) + used = 2 + words[idx + 1] = '' + isTime = False + if extract_number_it(word) and word_next \ + and word_next == 'in_punto': + str_hh = str(int(extract_number_it(word))) + used = 2 + if word_next == 'pm': + remainder = 'pm' + hr_abs = int(str_hh) + min_abs = int(str_mm) + if hr_abs <= 12: + hr_abs = hr_abs + 12 + used = 2 + elif word_next == 'am': + remainder = 'am' + hr_abs = int(str_hh) + min_abs = int(str_mm) + used = 2 + elif word_next == 'mattina': + # ' 11 del mattina' + hh = int(str_hh) + mm = int(str_mm) + used = 2 + remainder = 'am' + isTime = False + hr_abs = hh + min_abs = mm + elif word_next == 'pomeriggio': + # ' 2 del pomeriggio' + hh = int(str_hh) + mm = int(str_mm) + if hh < 12: + hh += 12 + used = 2 + remainder = 'pm' + isTime = False + hr_abs = hh + min_abs = mm + elif word_next == 'sera': + # 'alle 8 di sera' + hh = int(str_hh) + mm = int(str_mm) + if hh < 12: + hh += 12 + used = 2 + remainder = 'pm' + isTime = False + hr_abs = hh + min_abs = mm + elif word_next == 'notte': + hh = int(str_hh) + mm = int(str_mm) + if hh > 5: + remainder = 'pm' + else: + remainder = 'am' + used = 2 + 
isTime = False + hr_abs = hh + min_abs = mm + # parse half an hour : undici e mezza + elif word_next and word_next == 'mezza': + hr_abs = int(str_hh) + min_abs = 30 + used = 2 + isTime = False + elif word_next and word_next == 'in_punto': + hr_abs = int(str_hh) + min_abs = 0 + str_mm = '0' + used = 2 + isTime = False + else: + # 17:30 + remainder = '' + hr_abs = int(str_hh) + min_abs = int(str_mm) + used = 1 + isTime = False + if word_prev == 'ora': + words[idx - 1] = '' + + if time_qualifier != '': + # military = True + if str_hh and int(str_hh) <= 12 and \ + (time_qualifier in time_qualifiers_pm): + str_hh = str(int(str_hh) + 12) + else: + isTime = False + + str_hh = int(str_hh) if str_hh else 0 + str_mm = int(str_mm) if str_mm else 0 + + str_hh = str_hh + 12 if remainder == 'pm' \ + and str_hh < 12 else str_hh + str_hh = str_hh - 12 if remainder == 'am' \ + and str_hh >= 12 else str_hh + + if (not military and + remainder not in ['am', 'pm'] and + ((not day_specified) or day_offset < 1)): + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + hr_abs = str_hh + if anchorDate.hour < str_hh: + pass # No modification needed + elif anchorDate.hour < str_hh + 12: + str_hh += 12 + hr_abs = str_hh + else: + # has passed, assume the next morning + day_offset += 1 + + if time_qualifier in time_qualifiers_pm and str_hh < 12: + str_hh += 12 + + if str_hh > 24 or str_mm > 59: + isTime = False + used = 0 + if isTime: + hr_abs = str_hh * 1 + min_abs = str_mm * 1 + used += 1 + + if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or + time_qualifier == 'pomeriggio'): + hr_abs = (hr_abs or 0) + 12 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = '' + + if word_prev == 'o' or word_prev == 'oh': + words[words.index(word_prev)] = '' + + if idx > 0 and word_prev in markers: + words[idx - 1] = '' + if idx > 1 and word_prev_prev in markers: + words[idx - 2] = '' + 
+ idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if day_offset is False: + day_offset = 0 + + # perform date manipulation + + extracted_date = anchorDate.replace(microsecond=0) + + if datestr != '': + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', 'sept', 'oct', 'nov', 'dec'] + + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + + for idx, en_month in enumerate(en_months_short): + datestr = datestr.replace(months_short[idx], en_month) + + try: + temp = datetime.strptime(datestr, '%B %d') + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, '%B %d %Y') + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + if not has_year: + temp = temp.replace(year=extracted_date.year, + tzinfo=extracted_date.tzinfo) + if extracted_date < temp: + extracted_date = extracted_date.replace( + year=int(current_year), + month=int(temp.strftime('%m')), + day=int(temp.strftime('%d')), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(current_year) + 1, + month=int(temp.strftime('%m')), + day=int(temp.strftime('%d')), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(temp.strftime('%Y')), + month=int(temp.strftime('%m')), + day=int(temp.strftime('%d')), + tzinfo=extracted_date.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hr_offset == 0 and min_offset == 0 and sec_offset == 0: + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + + if year_offset != 0: + extracted_date = extracted_date + relativedelta(years=year_offset) + if month_offset != 0: + extracted_date = extracted_date + relativedelta(months=month_offset) + if 
def get_gender_it(word, context=""):
    """
    Guess the grammatical gender of an Italian word.

    In Italian the gender of a word is best determined from the article
    that precedes it, not only from its last letter.  When ``word`` occurs
    in ``context`` at any position other than the first, the gender is
    derived from the word right before it; when that yields nothing (or the
    word is not in the context) it is guessed from the last letter of
    ``word`` itself.

    Args:
        word (str): the word whose gender is wanted
        context (str): optional space-separated sentence containing ``word``

    Returns:
        str or None: 'f', 'm', or None when no gender can be determined
    """
    gender = None
    words = context.split(' ')
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            # Only the first non-leading occurrence is considered.
            gender = get_gender_it(words[idx - 1])
            break

    # Leading or doubled spaces in the context produce empty strings, which
    # reach this point through the recursive call above; indexing
    # word[-1] on them would raise IndexError, so skip the suffix guess.
    if not gender and word:
        last = word[-1]
        if last in ('a', 'e'):
            gender = 'f'
        elif last in ('o', 'n', 'l', 'i'):
            gender = 'm'

    return gender
def extract_numbers_it(text, short_scale=False, ordinals=False):
    """
    Extract every number found in a string.

    Thin wrapper: the work is delegated to extract_numbers_generic, which
    is handed the Italian pronounce_number_it and extract_number_it helpers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use "short scale" (True) or "long scale" (False)
            names for large numbers -- over a million.  Defaults to False.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: the result of extract_numbers_generic (a list of the
            extracted numbers)
    """
    options = {'short_scale': short_scale, 'ordinals': ordinals}
    return extract_numbers_generic(text, pronounce_number_it,
                                   extract_number_it, **options)


class ItalianNormalizer(Normalizer):
    """ TODO implement language specific normalizer"""
def _convert_words_to_numbers_nl(text, short_scale=True, ordinals=False):
    """Replace number words in a string with their numeric value.

    The text is lower-cased and tokenized; every span of tokens recognised
    as a number is replaced by ``str(value)`` and all other tokens are kept.

    Args:
        text str: the text to convert
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third) should
                          be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The lower-cased text, with numbers subbed in where appropriate.
    """
    tokens = tokenize(text.lower())
    pending = _extract_numbers_with_text_nl(tokens, short_scale, ordinals)
    pending.sort(key=lambda number: number.start_index)

    pieces = []
    for token in tokens:
        # Token lies before the next number (or no numbers remain): keep it.
        if not pending or token.index < pending[0].start_index:
            pieces.append(token.word)
            continue

        # Token belongs to the number at the head of the queue: emit the
        # value once at its first token, drop the number after its last.
        if token.index == pending[0].start_index:
            pieces.append(str(pending[0].value))
        if token.index == pending[0].end_index:
            pending.pop(0)

    return ' '.join(pieces)
def _extract_numbers_with_text_nl(tokens, short_scale=True,
                                  ordinals=False, fractional_numbers=True):
    """Collect every number in a list of Tokens, with the words behind it.

    Numbers are extracted one at a time; after each hit the tokens it
    covers are blanked out (their indices are kept) and the search repeats
    until nothing more is found.

    Args:
        tokens [Token]: The tokens to parse.
        short_scale bool: True if short scale numbers should be used, False for
                          long scale. True by default.
        ordinals bool: True if ordinal words (first, second, third, etc) should
                       be parsed.
        fractional_numbers bool: True if we should look for fractions and
                                 decimals.

    Returns:
        [ReplaceableNumber]: the numbers found, sorted by start index.
    """
    found = []
    number = _extract_number_with_text_nl(tokens, short_scale,
                                          ordinals, fractional_numbers)
    while number:
        found.append(number)
        # An empty-word placeholder keeps the remaining indices correct.
        tokens = [
            Token("", tok.index)
            if number.start_index <= tok.index <= number.end_index
            else tok
            for tok in tokens
        ]
        number = _extract_number_with_text_nl(tokens, short_scale,
                                              ordinals, fractional_numbers)

    found.sort(key=lambda n: n.start_index)
    return found


def _extract_number_with_text_nl(tokens, short_scale=True,
                                 ordinals=False, fractional_numbers=True):
    """Extract a single number from a list of Tokens.

    Args:
        tokens [Token]: the tokens to parse
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): True if we should look for fractions and
                                   decimals.
    Returns:
        ReplaceableNumber: the value and the tokens it was built from,
            with leading articles trimmed off.
    """
    value, used = \
        _extract_number_with_text_nl_helper(tokens, short_scale,
                                            ordinals, fractional_numbers)
    # Count the leading articles, then remove them in place.
    leading = 0
    while used and leading < len(used) \
            and used[leading].word in _ARTICLES_NL:
        leading += 1
    if leading:
        del used[:leading]
    return ReplaceableNumber(value, used)
def _extract_number_with_text_nl_helper(tokens,
                                        short_scale=True, ordinals=False,
                                        fractional_numbers=True):
    """Helper for _extract_number_with_text_nl.

    Holds the real parsing logic.  The result may still carry leading
    articles; the caller trims those off.

    When fractional_numbers is set, the fraction parser is tried first,
    then the decimal parser; the whole-number parser is the fallback.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:
        fractional_numbers boolean:

    Returns:
        int or float, [Token]
    """
    if fractional_numbers:
        for extractor in (_extract_fraction_with_text_nl,
                          _extract_decimal_with_text_nl):
            value, value_tokens = extractor(tokens, short_scale, ordinals)
            if value:
                return value, value_tokens

    return _extract_whole_number_with_text_nl(tokens, short_scale, ordinals)


def _extract_fraction_with_text_nl(tokens, short_scale, ordinals):
    """Extract a mixed number ("whole marker fraction") from tokens.

    This function handles text such as '2 and 3/4'.  Note that "one half" or
    similar will be parsed by the whole number function.

    Args:
        tokens [Token]: words and their indexes in the original string.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (int or float, [Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.
    """
    for marker in _FRACTION_MARKER_NL:
        parts = partition_list(tokens, lambda t: t.word == marker)
        if len(parts) != 3:
            continue
        before, separator, after = parts

        whole_candidates = \
            _extract_numbers_with_text_nl(before, short_scale,
                                          ordinals, fractional_numbers=False)
        fraction_candidates = \
            _extract_numbers_with_text_nl(after, short_scale,
                                          ordinals, fractional_numbers=True)
        if not (whole_candidates and fraction_candidates):
            return None, None

        # The number right before the marker must not be a fraction and the
        # number right after it must be one.
        whole = whole_candidates[-1]
        fraction = fraction_candidates[0]
        if whole.value >= 1 and 0 < fraction.value < 1:
            return whole.value + fraction.value, \
                whole.tokens + separator + fraction.tokens

    return None, None
def _extract_decimal_with_text_nl(tokens, short_scale, ordinals):
    """Extract a spoken decimal ("number marker number") from tokens.

    This function handles text such as '2 point 5'.

    Notes:
        The numbers on either side of the marker are parsed with
        _extract_numbers_with_text_nl (fractions and decimals disabled).

        This does not currently handle things like:
            number dot number number number

    Args:
        tokens [Token]: The text to parse.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (float, [Token])
        The value found and relevant tokens.
        (None, None) if no decimal value is found.
    """
    for marker in _DECIMAL_MARKER_NL:
        parts = partition_list(tokens, lambda t: t.word == marker)
        if len(parts) != 3:
            continue
        before, separator, after = parts

        whole_candidates = \
            _extract_numbers_with_text_nl(before, short_scale,
                                          ordinals, fractional_numbers=False)
        digit_candidates = \
            _extract_numbers_with_text_nl(after, short_scale,
                                          ordinals, fractional_numbers=False)
        if not (whole_candidates and digit_candidates):
            return None, None

        whole = whole_candidates[-1]
        digits = digit_candidates[0]

        # TODO handle number dot number number number
        if "." in str(digits.text):
            continue
        return whole.value + float('0.' + str(digits.value)), \
            whole.tokens + separator + digits.tokens

    return None, None
def _extract_whole_number_with_text_nl(tokens, short_scale, ordinals):
    """Handle numbers not handled by the decimal or fraction functions.

    This is generally whole numbers. Note that phrases such as "one half" will
    be handled by this function, while "one and a half" are handled by the
    fraction function.

    The tokens are scanned left to right while a running value (``val``) and
    the tokens that contributed to it (``number_words``) are maintained.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:

    Returns:
        int or float, [_Tokens]
        The value parsed, and tokens that it corresponds to.
        The value is False when no number was recognised (``val`` starts
        out as False).
    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data_nl(short_scale)

    number_words = []  # type: [Token]
    val = False       # running value; False means "nothing parsed yet"
    prev_val = None   # value carried over from the previous number word
    next_val = None   # fraction value of the following token, if any
    to_sum = []       # completed groups, e.g. "two million" in "2 mln 500"
    for idx, token in enumerate(tokens):
        current_val = None
        # The previous iteration already consumed this token as the
        # fraction word following it ("2 fifths"), so skip it.
        if next_val:
            next_val = None
            continue

        word = token.word
        # Articles and negatives are kept as part of the number's text but
        # carry no value of their own.
        if word in _ARTICLES_NL or word in _NEGATIVES_NL:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""

        # First branch: the word is not number-like in any supported way.
        if word not in string_num_scale and \
                word not in _STRING_NUM_NL and \
                word not in _SUMS_NL and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not is_fractional_nl(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            words_only = [token.word for token in number_words]
            # Stop if a real number was already collected; otherwise drop
            # the articles/negatives gathered so far and keep scanning.
            if number_words and not all([w in _ARTICLES_NL |
                                         _NEGATIVES_NL for w in words_only]):
                break
            else:
                number_words = []
                continue
        # The previous word cannot combine with this one: start a new number.
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS_NL \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES_NL \
                and prev_word not in _ARTICLES_NL:
            number_words = [token]
        elif prev_word in _SUMS_NL and word in _SUMS_NL:
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_NL:
            val = _STRING_NUM_NL.get(word)
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale.get(word)
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in _SUMS_NL and val and val < 10:
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        # NOTE(review): is_fractional_nl is defined outside this block; it
        # is assumed to return a falsy value for non-fraction words - confirm.
        if val is False:
            val = is_fractional_nl(word, short_scale=short_scale)
            current_val = val

        # 2 fifths
        if not ordinals:
            next_val = is_fractional_nl(next_word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES_NL:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
                current_val = val

        else:
            if prev_word in _SUMS_NL and word not in _SUMS_NL and current_val >= 10:
                # Backtrack - we've got numbers we can't sum.
                number_words.pop()
                val = prev_val
                break
            prev_val = val

            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                # Park the finished group and start accumulating afresh.
                to_sum.append(val)
                val = 0
                prev_val = 0

    # Add the parked groups back onto the last running value.
    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words
def _initialize_number_data_nl(short_scale):
    """Pick the word/number tables that match the requested scale.

    This is a helper function for _extract_whole_number_with_text_nl.

    Args:
        short_scale boolean: True selects the short-scale tables, False the
            long-scale ones.

    Returns:
        (multiplies, string_num_ordinal, string_num_scale)
        The multiplier words, the ordinal table, and the scale table
        passed through invert_dict.
    """
    if short_scale:
        multiplies = _MULTIPLIES_SHORT_SCALE_NL
        ordinal_table = _STRING_SHORT_ORDINAL_NL
        scale_table = _SHORT_SCALE_NL
    else:
        multiplies = _MULTIPLIES_LONG_SCALE_NL
        ordinal_table = _STRING_LONG_ORDINAL_NL
        scale_table = _LONG_SCALE_NL

    return multiplies, ordinal_table, invert_dict(scale_table)


def extract_number_nl(text, short_scale=True, ordinals=False):
    """Extract a number from a text string

    The function handles pronunciations in long scale and short scale

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found
    """
    tokens = tokenize(text.lower())
    number = _extract_number_with_text_nl(tokens, short_scale, ordinals)
    return number.value
def extract_duration_nl(text):
    """Convert a Dutch phrase into a duration.

    Number words are first converted to digits, then every
    "<number> <unit>" pair is summed per unit, for phrases such as:
        "10 minuten"
        "3 dagen 8 uur 10 minuten"

    The words used in the duration will be consumed, and
    the remainder returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    None (not a tuple) is returned when ``text`` is empty.
    """
    if not text:
        return None

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    nl_translations = {
        'microseconds': ["microsecond", "microseconde", "microseconden",
                         "microsecondje", "microsecondjes"],
        'milliseconds': ["millisecond", "milliseconde", "milliseconden",
                         "millisecondje", "millisecondjes"],
        'seconds': ["second", "seconde", "seconden", "secondje", "secondjes"],
        'minutes': ["minuut", "minuten", "minuutje", "minuutjes"],
        'hours': ["uur", "uren", "uurtje", "uurtjes"],
        'days': ["dag", "dagen", "dagje", "dagjes"],
        'weeks': ["week", "weken", "weekje", "weekjes"]
    }

    # The named group must carry a name: a bare "(?P\d+...)" is rejected
    # by the re module with re.error.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}"
    text = _convert_words_to_numbers_nl(text)

    for unit in time_units:
        # Longest spelling first, so "seconden" is consumed before its
        # prefix "second" can match and leave "en" behind.
        unit_nl_words = sorted(nl_translations[unit], key=len, reverse=True)
        for unit_nl in unit_nl_words:
            unit_pattern = pattern.format(unit=unit_nl)
            matches = re.findall(unit_pattern, text)
            value = sum(map(float, matches))
            time_units[unit] = time_units[unit] + value
            text = re.sub(unit_pattern, '', text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)
On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. + + Args: + text (str): string containing date words + dateNow (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace(' de ', ' ').replace(' het ', ' ').replace(' het ', ' ') \ + .replace("paar", "2").replace("eeuwen", "eeuw") \ + .replace("decennia", "decennium") \ + .replace("millennia", "millennium") + + wordList = s.split() + for idx, word in enumerate(wordList): + ordinals = ["ste", "de"] + if word[0].isdigit(): + for ordinal in ordinals: + # "second" is the only case we should not do this + if ordinal in word and "second" not in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['ochtend'] + timeQualifiersPM = ['middag', 'avond', 'nacht'] + timeQualifiersList = timeQualifiersAM + timeQualifiersPM + timeQualifierOffsets = [8, 15, 19, 0] + markers = ['op', 'in', 'om', 'tegen', 'over', + 'deze', 'rond', 'voor', 'van', "binnen"] + days = ["maandag", "dinsdag", "woensdag", 
"donderdag", "vrijdag", + "zaterdag", "zondag"] + day_parts = [a + b for a in days for b in timeQualifiersList] + months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', + 'juli', 'augustus', 'september', 'oktober', 'november', + 'december'] + recur_markers = days + [d+'en' for d in days] + ['weekeinde', 'werkdag', + 'weekeinden', 'werkdagen'] + months_short = ['jan', 'feb', 'mar', 'apr', 'mei', 'jun', 'jul', 'aug', + 'sep', 'okt', 'nov', 'dec'] + year_multiples = ["decennium", "eeuw", "millennium"] + day_multiples = ["dagen", "weken", "maanden", "jaren"] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + + if word == "nu" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_nl(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if wordNext == "decennium": + yearOffset = multiplier * 10 + elif wordNext == "eeuw": + yearOffset = multiplier * 100 + elif wordNext == "millennium": + yearOffset = multiplier * 1000 + # paar + elif word == "2" and \ + wordNextNext in year_multiples: + multiplier = 2 + used += 2 + if wordNextNext == "decennia": + yearOffset = multiplier * 10 + elif wordNextNext == "eeuwen": + yearOffset = multiplier * 100 + elif wordNextNext == "millennia": + yearOffset = multiplier * 1000 + elif word == "2" and \ + wordNextNext in day_multiples: + multiplier = 2 + used += 2 + if wordNextNext == "jaren": + yearOffset = multiplier + elif wordNextNext == 
"maanden": + monthOffset = multiplier + elif wordNextNext == "weken": + dayOffset = multiplier * 7 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "vandaag" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "morgen" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "overmorgen" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "dag" or word == "dagen": + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "week" or word == "weken" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "volgende": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "vorige": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "maand" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "volgende": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "vorige": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "jaar" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "volgend": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "vorig": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "volgende": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "vorige": + dayOffset -= 7 + used += 1 + start -= 1 + elif word in day_parts and not fromFlag: + d = day_parts.index(word) / len(timeQualifiersList) + dayOffset = (d + 1) - int(today) + if dayOffset < 0: + dayOffset += 7 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in months_short and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = months_short.index(word) + used += 1 + datestr = months[m] + if wordPrev and \ + (wordPrev[0].isdigit() or (wordPrev == "van" and + wordPrevPrev[0].isdigit())): + if wordPrev == "van" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + months_short + validFollowups.append("vandaag") + validFollowups.append("morgen") + validFollowups.append("volgende") + validFollowups.append("vorige") + validFollowups.append("nu") + if (word == "van" or word == "na") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "morgen": + dayOffset += 1 + elif wordNext == "overmorgen": + dayOffset += 2 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif 
wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "volgende": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "vorige": + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "deze": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse nacht ochtend, middag, avond + used = 0 + if word.startswith("gister"): + dayOffset = -1 + elif word.startswith("morgen"): + dayOffset = 1 + + if word.endswith("nacht"): + if hrAbs is None: + hrAbs = 0 + used += 1 + elif word.endswith("ochtend"): + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word.endswith("middag"): + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word.endswith("avond"): + if hrAbs is None: + hrAbs = 19 + used += 1 + + # "paar" time_unit + elif word == "2" and \ + wordNextNext in ["uur", "minuten", "seconden"]: + used += 2 + if wordNextNext == "uur": + hrOffset = 2 + elif wordNextNext == "minuten": + minOffset = 2 + elif wordNextNext == "seconden": + secOffset = 2 + # parse half an hour, quarter hour + elif word == "uur" and \ + (wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "half": + minOffset = 30 + elif wordPrev == "kwartier": + minOffset = 15 + elif wordPrevPrev == "kwartier": + minOffset = 15 + if idx > 2 and words[idx - 3] in markers: + words[idx - 3] = 
"" + if words[idx - 3] == "deze": + daySpecified = True + words[idx - 2] = "" + elif wordPrev == "binnen": + hrOffset = 1 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "deze": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse "over een minuut" + elif word == "minuut" and wordPrev == "over": + minOffset = 1 + words[idx - 1] = "" + used += 1 + # parse "over een seconde" + elif word == "seconde" and wordPrev == "over": + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + if wordNext == "vannacht" or wordNextNext == "vannacht" or \ + wordPrev == "vannacht" or wordPrevPrev == "vannacht" or \ + wordNextNextNext == "vannacht": + remainder = "pm" + used += 1 + if wordPrev == "vannacht": + words[idx - 1] = "" + if wordPrevPrev == "vannacht": + words[idx - 2] = "" + if wordNextNext == "vannacht": + used += 1 + if wordNextNextNext == "vannacht": + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + + elif wordNext == "in" and wordNextNext == "ochtend": + remainder = "am" + used += 2 + elif wordNext == "in" and wordNextNext == "middag": + remainder = "pm" + used += 2 + elif wordNext == "in" and wordNextNext == "avond": + remainder = "pm" + used += 2 + elif wordNext == "'s" and wordNextNext == "ochtends": 
+ remainder = "am" + used += 2 + elif wordNext == "'s" and wordNextNext == "middags": + remainder = "pm" + used += 2 + elif wordNext == "'s" and wordNextNext == "avonds": + remainder = "pm" + used += 2 + elif wordNext == "deze" and wordNextNext == "ochtend": + remainder = "am" + used = 2 + daySpecified = True + elif wordNext == "deze" and wordNextNext == "middag": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "deze" and wordNextNext == "avond": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "'s" and wordNextNext == "nachts": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if ( + (wordNext == "uren" or wordNext == "uur" or + remainder == "uren" or remainder == "uur") and + word[0] != '0' and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuten" or wordNext == "minuut" or \ + remainder == "minuten" or remainder == "minuut": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "seconden" or wordNext == "seconde" \ + or remainder == "seconden" or \ + remainder == "seconde": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. "3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "uur" or remainder == "uur": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. 
"04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext == "uur" or remainder == "uur"): + used += 1 + elif ( + wordNext == "" or wordNext == "uur" or + ( + wordNext == "in" and + ( + wordNextNext == "de" or + wordNextNext == timeQualifier + ) + ) or wordNext == 'vannacht' or + wordNextNext == 'vannacht'): + + strHH = strNum + strMM = "00" + if wordNext == "uur": + used += 1 + + if wordNext == "in" or wordNextNext == "in": + used += (1 if wordNext == "in" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm', 'uren', 'minuten', + "seconde", "seconden", + "uur", "minuut"] and + ((not daySpecified) or dayOffset < 1)): + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if anchorDate.hour < HH or (anchorDate.hour == HH and + anchorDate.minute < MM): + pass # No modification needed + elif anchorDate.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used 
= 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "vroeg": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "laat": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "deze": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "deze": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. "june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = 
extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate.replace(hour=hrAbs, + minute=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "en" and \ + words[idx - 1] == "" and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def is_fractional_nl(input_str, short_scale=True): + """This function takes the given text and checks if it is a fraction. 
+ + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + """ + fracts = {"heel": 1, "half": 2, "halve": 2, "kwart": 4} + if short_scale: + for num in _SHORT_ORDINAL_STRING_NL: + if num > 2: + fracts[_SHORT_ORDINAL_STRING_NL[num]] = num + else: + for num in _LONG_ORDINAL_STRING_NL: + if num > 2: + fracts[_LONG_ORDINAL_STRING_NL[num]] = num + + if input_str.lower() in fracts: + return 1.0 / fracts[input_str.lower()] + return False + + +def extract_numbers_nl(text, short_scale=True, ordinals=False): + """Takes in a string and extracts a list of numbers. + + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Returns: + list: list of extracted numbers as floats + """ + results = _extract_numbers_with_text_nl(tokenize(text), + short_scale, ordinals) + return [float(result.value) for result in results] + + +def normalize_nl(text, remove_articles=True): + """Dutch string normalization.""" + + words = text.split() # this also removed extra spaces + normalized = "" + for word in words: + if remove_articles and word in _ARTICLES_NL: + continue + + # Convert numbers into digits, e.g. 
"two" -> "2" + textNumbers = ["nul", "een", "twee", "drie", "vier", "vijf", "zes", + "zeven", "acht", "negen", "tien", "elf", "twaalf", + "dertien", "veertien", "vijftien", "zestien", + "zeventien", "achttien", "negentien", "twintig"] + + if word in textNumbers: + word = str(textNumbers.index(word)) + + normalized += " " + word + + return normalized[1:] # strip the initial space + + +class DutchNormalizer(Normalizer): + """ TODO implement language specific normalizer""" diff --git a/lingua_franca/lang/parse_pl.py b/lingua_franca/lang/parse_pl.py new file mode 100644 index 0000000..84f83bc --- /dev/null +++ b/lingua_franca/lang/parse_pl.py @@ -0,0 +1,1404 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from datetime import datetime, timedelta + +from dateutil.relativedelta import relativedelta + +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ + invert_dict, ReplaceableNumber, partition_list, tokenize, Token +from lingua_franca.lang.common_data_pl import _NUM_STRING_PL, \ + _SHORT_SCALE_PL, _SHORT_ORDINAL_PL, _FRACTION_STRING_PL, _TIME_UNITS_CONVERSION, \ + _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ + _ALT_ORDINALS_PL +from lingua_franca.time import now_local +import re + + +def generate_plurals_pl(originals): + """ + Return a new set or dict containing the plural form of the original values, + + In English this means all with 's' appended to them. 
+ + Args: + originals set(str) or dict(str, any): values to pluralize + + Returns: + set(str) or dict(str, any) + + """ + if isinstance(originals, dict): + result = {key + 'y': value for key, value in originals.items()} + result = {**result, **{key + 'ów': value for key, value in originals.items()}} + result = {**result, **{'tysiące': 1000, 'tysięcy': 1000}} + + return result + + result = {value + "y" for value in originals} + result = result.union({value + "ów" for value in originals}) + result = result.union({'tysiące', 'tysięcy'}) + + return result + + +def generate_fractions_pl(fractions): + '''Returns a list of all fraction combinations. E.g.: + trzecia, trzecich, trzecie + czwarta, czwarte, czwartych + + :param fractions: Existing fractions + :return: Fractions with add suffixes + ''' + + result = {**fractions} + for k, v in fractions.items(): + k_no_last = k[:-1] + result[k_no_last + 'e'] = v + if k_no_last[-1:] == 'i': + result[k_no_last + 'ch'] = v + else: + result[k_no_last + 'ych'] = v + + for k,v in _SHORT_ORDINAL_PL.items(): + result[v[:-1] + 'a'] = k + + result['jedno'] = 1 + result['czwartego'] = 4 + + return result + + +# negate next number (-2 = 0 - 2) +_NEGATIVES = {"ujemne", "minus"} + +# sum the next number (twenty two = 20 + 2) +_SUMS = {'dwadzieścia', '20', 'trzydzieści', '30', 'czterdzieści', '40', 'pięćdziesiąt', '50', + 'sześćdziesiąt', '60', 'siedemdziesiąt', '70', 'osiemdziesiąt', '80', 'dziewięćdziesiąt', '90'} + +_MULTIPLIES_SHORT_SCALE_PL = generate_plurals_pl(_SHORT_SCALE_PL.values()) + +# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) +_FRACTION_MARKER = {'i'} + +# decimal marker ( 1 point 5 = 1 + 0.5) +_DECIMAL_MARKER = {'kropka', 'przecinek'} + +_STRING_NUM_PL = invert_dict(_NUM_STRING_PL) +_STRING_NUM_PL.update(generate_plurals_pl(_STRING_NUM_PL)) +_STRING_NUM_PL.update({ + 'pół': 0.5, + 'połówka': 0.5, + 'połowa': 0.5, +}) + +_STRING_SHORT_ORDINAL_PL = invert_dict(_SHORT_ORDINAL_PL) + +_REV_FRACTITONS = 
generate_fractions_pl(invert_dict(_FRACTION_STRING_PL)) + + +def _convert_words_to_numbers_pl(text, short_scale=True, ordinals=False): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. first, second, third) should + be parsed to their number values (1, 2, 3...) + + Returns: + str + The original text, with numbers subbed in where appropriate. + + """ + text = text.lower() + tokens = tokenize(text) + numbers_to_replace = \ + _extract_numbers_with_text_pl(tokens, short_scale, ordinals) + numbers_to_replace.sort(key=lambda number: number.start_index) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) + else: + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + + +def _extract_numbers_with_text_pl(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (first, second, third, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. 
+ + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + _extract_number_with_text_pl(tokens, short_scale, + ordinals, fractional_numbers) + + if not to_replace: + break + + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + +def _extract_number_with_text_pl(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + This function extracts a number from a list of Tokens. + + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + _extract_number_with_text_pl_helper(tokens, short_scale, + ordinals, fractional_numbers) + return ReplaceableNumber(number, tokens) + + +def _extract_number_with_text_pl_helper(tokens, + short_scale=True, ordinals=False, + fractional_numbers=True): + """ + Helper for _extract_number_with_text_en. + + This contains the real logic for parsing, but produces + a result that needs a little cleaning (specific, it may + contain leading articles that can be trimmed off). 
+ + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + int or float, [Tokens] + + """ + if fractional_numbers: + fraction, fraction_text = \ + _extract_fraction_with_text_pl(tokens, short_scale, ordinals) + if fraction: + return fraction, fraction_text + + decimal, decimal_text = \ + _extract_decimal_with_text_pl(tokens, short_scale, ordinals) + if decimal: + return decimal, decimal_text + + return _extract_whole_number_with_text_pl(tokens, short_scale, ordinals) + + +def _extract_fraction_with_text_pl(tokens, short_scale, ordinals): + """ + Extract fraction numbers from a string. + + This function handles text such as '2 and 3/4'. Note that "one half" or + similar will be parsed by the whole number function. + + Args: + tokens [Token]: words and their indexes in the original string. + short_scale boolean: + ordinals boolean: + + Returns: + (int or float, [Token]) + The value found, and the list of relevant tokens. + (None, None) if no fraction value is found. + + """ + for c in _FRACTION_MARKER: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_pl(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_pl(partitions[2], short_scale, + ordinals, fractional_numbers=True) + + if not numbers1 or not numbers2: + return None, None + + # ensure first is not a fraction and second is a fraction + num1 = numbers1[-1] + num2 = numbers2[0] + if num1.value >= 1 and 0 < num2.value < 1: + return num1.value + num2.value, \ + num1.tokens + partitions[1] + num2.tokens + + return None, None + + +def _extract_decimal_with_text_pl(tokens, short_scale, ordinals): + """ + Extract decimal numbers from a string. + + This function handles text such as '2 point 5'. 
+ + Notes: + While this is a helper for extractnumber_en, it also depends on + extractnumber_en, to parse out the components of the decimal. + + This does not currently handle things like: + number dot number number number + + Args: + tokens [Token]: The text to parse. + short_scale boolean: + ordinals boolean: + + Returns: + (float, [Token]) + The value found and relevant tokens. + (None, None) if no decimal value is found. + + """ + for c in _DECIMAL_MARKER: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_pl(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_pl(partitions[2], short_scale, + ordinals, fractional_numbers=False) + + if not numbers1 or not numbers2: + return None, None + + number = numbers1[-1] + decimal = numbers2[0] + + # TODO handle number dot number number number + if "." not in str(decimal.text): + return number.value + float('0.' + str(decimal.value)), \ + number.tokens + partitions[1] + decimal.tokens + return None, None + + +def _extract_whole_number_with_text_pl(tokens, short_scale, ordinals): + """ + Handle numbers not handled by the decimal or fraction functions. This is + generally whole numbers. Note that phrases such as "one half" will be + handled by this function, while "one and a half" are handled by the + fraction function. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. 
+ + """ + multiplies, string_num_ordinal, string_num_scale = \ + _initialize_number_data(short_scale) + + number_words = [] # type: [Token] + val = False + prev_val = None + next_val = None + to_sum = [] + for idx, token in enumerate(tokens): + current_val = None + if next_val: + next_val = None + continue + + word = token.word + + prev_word = tokens[idx - 1].word if idx > 0 else "" + next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" + + if is_numeric(word[:-1]) and word.endswith('.'): + # explicit ordinals, 1., 2., 3., 4.... N. + word = word[:-1] + + word = normalize_word_pl(word) + + if word not in string_num_scale and \ + word not in _STRING_NUM_PL and \ + word not in _SUMS and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not isFractional_pl(word) and \ + not look_for_fractions(word.split('/')): + words_only = [token.word for token in number_words] + if number_words and not all([w in _NEGATIVES for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and prev_word not in multiplies \ + and prev_word not in _SHORT_SCALE_PL.values() \ + and prev_word not in _SUMS \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in _NEGATIVES: + number_words = [token] + elif prev_word in _SUMS and word in _SUMS: + number_words = [token] + else: + number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? 
+ if word in _STRING_NUM_PL: + val = _STRING_NUM_PL.get(word) + current_val = val + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + prev_val = None + + # is the prev word a number and should we sum it? + # twenty two, fifty six + if prev_val: + if (prev_word in string_num_ordinal and val and val < prev_val) or \ + (prev_word in _STRING_NUM_PL and val and val < prev_val and val // 10 != prev_val // 10) or \ + all([prev_word in multiplies, val < prev_val if prev_val else False]): + val += prev_val + + if next_word in multiplies: + prev_val = val + continue + + # is this a spoken fraction? + # half cup + if val is False: + val = isFractional_pl(word) + current_val = val + + # 2 fifths + if not ordinals: + next_val = isFractional_pl(next_word) + if next_val: + if not val: + val = 1 + val *= next_val + number_words.append(tokens[idx + 1]) + + # is this a negative number? + if val and prev_word and prev_word in _NEGATIVES: + val = 0 - val + + if next_word in _STRING_NUM_PL: + prev_val = val + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + number_words.append(tokens[idx + 1]) + else: + if all([ + prev_word in _SUMS, + word not in _SUMS, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. + number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. 
`current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) + # + # We have a dict `string_num_scale` containing [value, word] + # pairs for "all" powers of ten: string_num_scale[10] == "ten. + # + # We need go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million".`val` is 9000000. + # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. + # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. + # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions. 
+ # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + + time_to_sum = True + for other_token in tokens[idx+1:]: + if other_token.word in multiplies: + if string_num_scale[other_token.word] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + val += sum(to_sum) + + return val, number_words + + +def _initialize_number_data(short_scale): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. 
+ + Args: + short_scale boolean: + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = _MULTIPLIES_SHORT_SCALE_PL + + string_num_scale = invert_dict(_SHORT_SCALE_PL) + string_num_scale.update(generate_plurals_pl(string_num_scale)) + return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale + + +def extract_number_pl(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + return _extract_number_with_text_pl(tokenize(text.lower()), + True, ordinals).value + + +def extract_duration_pl(text): + """ + Convert an english phrase into a number of seconds + + Convert things like: + "10 minute" + "2 and a half hours" + "3 days 8 hours 10 minutes and 49 seconds" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. + """ + if not text: + return None + + time_units = { + 'microseconds': None, + 'milliseconds': None, + 'seconds': None, + 'minutes': None, + 'hours': None, + 'days': None, + 'weeks': None + } + + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ayeę]?" 
+ text = _convert_words_to_numbers_pl(text) + + for unit in _TIME_UNITS_CONVERSION: + unit_pattern = pattern.format(unit=unit) + matches = re.findall(unit_pattern, text) + value = sum(map(float, matches)) + unit_en = _TIME_UNITS_CONVERSION.get(unit) + if time_units[unit_en] is None or time_units.get(unit_en) == 0: + time_units[unit_en] = value + text = re.sub(unit_pattern, '', text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_pl(string, dateNow=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "today" + "tomorrow afternoon" + "next Tuesday at 4pm" + "August 3rd" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "what is Tuesday's weather forecast" + returns the date for the forthcoming Tuesday relative to the reference + date and the remainder string + "what is weather forecast". + + The "next" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. + + Args: + string (str): string containing date words + dateNow (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. 
+ s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("para", "2") + + wordList = s.split() + for idx, word in enumerate(wordList): + ordinals = ["ci", "szy", "gi"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if string == "": + return None + + dateNow = dateNow or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['rano'] + timeQualifiersPM = ['wieczór', 'w nocy'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['na', 'w', 'we', 'na', 'przez', 'ten', 'około', 'dla', 'o', "pomiędzy", 'za', 'do'] + days = list(_DAYS_TO_EN.keys()) + recur_markers = days + ['weekend', 'weekendy'] + monthsShort = ['sty', 'lut', 'mar', 'kwi', 'maj', 'cze', 'lip', 'sie', + 'wrz', 'paź', 'lis', 'gru'] + year_multiples = ['dekada', 'wiek', 'milenia'] + + words = clean_string(string) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + # save timequalifier for later + if word == 'w' and wordNext == 'tę': + used += 2 + if word == "temu" and dayOffset: + dayOffset = - dayOffset + used += 1 + if word == "teraz" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = 
dateNow.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_pl(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "dekada": + yearOffset = multiplier * 10 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "wiek": + yearOffset = multiplier * 100 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "milenia": + yearOffset = multiplier * 1000 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "dzisiaj" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "jutro" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "przedwczoraj" and not fromFlag: + dayOffset = -2 + used += 1 + elif word == "wczoraj" and not fromFlag: + dayOffset = -1 + used += 1 + elif word == "pojutrze" and not fromFlag: + dayOffset = 2 + used = 1 + elif word == "dzień" and wordNext != 'robocze': + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "tydzień" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "następny": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "miesiąc" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "następny": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "rok" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == 
"następny": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = _DAYS_TO_EN.get(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "następny": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in _MONTHS_TO_EN or word in monthsShort and not fromFlag: + used += 1 + datestr = _MONTHS_TO_EN[word] + if wordPrev and wordPrev[0].isdigit(): + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + list(_MONTHS_TO_EN.keys()) + monthsShort + validFollowups.append("dzisiaj") + validFollowups.append("jutro") + validFollowups.append("wczoraj") + validFollowups.append("następny") + validFollowups.append("poprzedni") + validFollowups.append('ostatni') + validFollowups.append("teraz") + validFollowups.append("tego") + if (word == "od" or word == "po") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "jutro": + dayOffset += 1 + elif wordNext == "wczoraj": + dayOffset -= 1 + elif wordNext in days: + d = _DAYS_TO_EN.get(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in 
days: + d = _DAYS_TO_EN.get(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "następny": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "poprzedni" or wordNext == 'ostatni': + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "ten": # this + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "południe": + hrAbs = 12 + used += 1 + elif word == "północ" or word == 'północy': + hrAbs = 0 + used += 1 + elif word == "rano": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "po" and wordNext == "południu": + if hrAbs is None: + hrAbs = 15 + used += 2 + elif word == "wieczór" or word == 'wieczorem': + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "nocy": + if hrAbs is None: + hrAbs = 22 + used += 1 + # parse half an hour, quarter hour + elif word == "godzina" and (wordPrev.isdigit() or wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "pół": + minOffset = 30 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "dzisiaj": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "minuta" and (wordPrev.isdigit() or wordPrev in markers): + minOffset = 1 + 
words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "sekunda" and (wordPrev.isdigit() or wordPrev in markers): + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if wordNext == "wieczorem" or wordPrev == "wieczorem" or \ + wordNext == 'wieczór' or wordPrev == 'wieczór' or \ + (wordNext == 'po' and wordNextNext == 'południu'): + remainder = "pm" + used += 2 if wordNext == 'po' else 1 + if wordPrev == "wieczorem" or wordPrev == 'wieczór': + words[idx - 1] = "" + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + if wordNext == "rano": + remainder = "am" + used += 1 + elif wordNext == "po" and wordNextNext == "południu": + remainder = "pm" + used += 2 + elif wordNext == "wieczorem": + remainder = "pm" + used += 1 + elif wordNext == "rano": + remainder = "am" + used += 1 + elif wordNext == "w" and wordNextNext == "nocy": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. 
+ length = len(word) + strNum = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + (word[0].isdigit() and (wordNext == 'wieczorem' or wordNext == 'wieczór')) or + (word[0].isdigit() and wordNext == 'po' and wordNextNext == 'południu') or + (word[0].isdigit() and wordNext == 'w' and wordNextNext == 'nocy')): + strHH = strNum + remainder = "pm" + used = 2 if wordNext in ['po', 'w'] else 1 + elif ( + remainder == "am" or + (word[0].isdigit() and wordNext == 'rano')): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers or ( + wordNext == 'w' and wordNextNext == 'dzień' and + wordNextNextNext == 'robocze' + )): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": + # "in 10 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "minuta" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "minuta": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "sekunda" \ + or _TIME_UNITS_NORMALIZATION.get(remainder) == "sekunda": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. 
"3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. "04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + elif ( + wordNext == "" or wordNext == "w" or wordNext == 'nocy' or + wordNextNext == 'nocy'): + strHH = strNum + strMM = "00" + + if wordNext == "za" or wordNextNext == "za": + used += (1 if wordNext == "za" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm'] and + remainder not in _TIME_UNITS_NORMALIZATION and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if dateNow.hour < HH or (dateNow.hour == HH and + dateNow.minute < MM): + pass # No modification needed + elif dateNow.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 
12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "rano": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "wieczorem": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "najbliższą": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "najbliższą": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. "june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = 
def isFractional_pl(input_str, short_scale=True):
    """
    Check whether the given word names a fraction.

    The lower-cased word is looked up in ``_REV_FRACTITONS``; when it is
    present, the stored value is used as the denominator of the result.

    Args:
        input_str (str): the word to check
        short_scale (bool): accepted for signature compatibility; it is not
            read anywhere in this function
    Returns:
        (bool) or (float): False if the word is not in the table, otherwise
        ``1.0 / _REV_FRACTITONS[word]``

    """
    key = input_str.lower()
    return 1.0 / _REV_FRACTITONS[key] if key in _REV_FRACTITONS else False


def extract_numbers_pl(text, short_scale=True, ordinals=False):
    """
    Extract every number found in a string.

    The text is tokenized and handed to ``_extract_numbers_with_text_pl``;
    the ``value`` of each returned result is converted to float.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): forwarded unchanged to
            ``_extract_numbers_with_text_pl``
        ordinals (bool): forwarded unchanged to
            ``_extract_numbers_with_text_pl``
    Returns:
        list: the extracted numbers as floats, in the order the helper
        returned them
    """
    tokens = tokenize(text)
    found = _extract_numbers_with_text_pl(tokens, short_scale, ordinals)
    numbers = []
    for item in found:
        numbers.append(float(item.value))
    return numbers
def normalize_word_pl(word):
    """
    Map a few inflected Polish numerals onto one canonical spelling.

    Words starting with 'jedn' and ending in 'ą', 'ej' or 'ym' become
    'jedna'; 'dwie' becomes 'dwa'; anything else is returned unchanged.
    """
    if word == 'dwie':
        return 'dwa'
    if word.startswith('jedn') and word.endswith(('ą', 'ej', 'ym')):
        return 'jedna'
    return word


def normalize_pl(text, remove_articles=True):
    """ Polish string normalization

    Splits the text on whitespace, optionally drops the word "i", rewrites
    time units, fractions and number words through the module tables, folds
    a handful of inflected forms onto a base form, and joins the result with
    single spaces.
    """
    next_forms = ('następną', 'następna', 'następnym', 'następnej')
    previous_forms = ('ostatnią', 'ostatnia', 'ostatnim', 'ostatniej',
                      'poprzednią', 'poprzednia', 'poprzednim',
                      'poprzedniej')

    pieces = []
    # str.split() without arguments also collapses runs of whitespace
    for token in text.split():
        if remove_articles and token == "i":
            continue

        if token in _TIME_UNITS_NORMALIZATION:
            token = _TIME_UNITS_NORMALIZATION[token]

        if token in _REV_FRACTITONS:
            token = str(_REV_FRACTITONS[token])

        # Reverse lookup: replace a spelled-out number by the first key
        # whose value equals it. The tables are consulted one after another,
        # each seeing the result of the previous one.
        for table in (_ORDINAL_BASE_PL, _NUM_STRING_PL, _ALT_ORDINALS_PL):
            for number, spelled in table.items():
                if spelled == token:
                    token = str(number)
                    break

        if token in next_forms:
            token = 'następny'
        elif token in previous_forms:
            token = 'poprzedni'
        elif token in ('jutra', 'jutrze'):
            token = 'jutro'
        elif token == 'wieczorem':
            token = 'wieczór'
        elif token == 'poranne':
            token = 'rano'

        pieces.append(token)

    return " ".join(pieces)
def is_fractional_pt(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    The word is lower-cased and a single trailing "s" (plural, e.g.
    "quintos") is removed before it is looked up, so matching is
    case-insensitive. Previously a word such as "Meio" passed the
    lower-cased membership test but then raised ValueError because the
    un-lowered string was searched in the list.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted for signature compatibility; it is not
            read anywhere in this function
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "quintos" -> "quinto"

    # Fraction word -> denominator. Several spellings of "seventh" are
    # accepted, with and without the accent.
    denominators = {
        "meio": 2,
        "terço": 3,
        "quarto": 4,
        "quinto": 5,
        "sexto": 6,
        "setimo": 7,
        "sétimo": 7,
        "septimo": 7,
        "séptimo": 7,
        "oitavo": 8,
        "nono": 9,
        "décimo": 10,
        "vigésimo": 20,
        "trigésimo": 30,
        "centésimo": 100,
        "milésimo": 1000,
    }

    denominator = denominators.get(word)
    if denominator is None:
        return False
    return 1.0 / denominator
def extract_number_pt(text, short_scale=True, ordinals=False):
    """
    Extract a number from the given Portuguese text.

    The text is lower-cased and scanned word by word. Number words from
    ``_NUMBERS_PT``, digit strings, numeric strings, fraction words and
    "a/b" fractions are accumulated; "e" joins a following number and a
    decimal separator word ("ponto", "vírgula", ...) attaches the digits
    that follow it.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): not used, kept for API compatibility
        ordinals (bool): not used, kept for API compatibility
    Returns:
        (int) or (float) or (bool): the extracted value, or False when no
        number was found (a result of 0 is also reported as False, because
        the function returns ``result or False``)

    """
    # TODO: short_scale and ordinals don't do anything here.
    # The parameters are present in the function signature for API compatibility
    # reasons.
    text = text.lower()
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_next_word = None
        if count + 1 < len(aWords):
            next_word = aWords[count + 1]
            if count + 2 < len(aWords):
                next_next_word = aWords[count + 2]
        else:
            next_word = None

        # is current word a number?
        if word in _NUMBERS_PT:
            val = _NUMBERS_PT[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif is_fractional_pt(word):
            if not result:
                result = 1
            result = result * is_fractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        ands = ["e"]
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]

            afterAndVal = extract_number_pt(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat the trailing number as a decimal part
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for token in newWords:
                        if token == "zero" or token == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extract_number_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        decimals = ["ponto", "virgula", "vírgula", ".", ","]
        # Only treat the separator as a decimal point when a number has
        # already been read; otherwise keep scanning (the old code built
        # the string "None.<digits>" here and crashed in float()).
        if next_word in decimals and result is not None:
            newWords = aWords[count + 2:]
            zeros = 0
            # leading zeros after the separator are lost by the recursive
            # call, so count them and put them back as text
            for token in newWords:
                if token == "zero" or token == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extract_number_pt(" ".join(newWords))
            digits = zeros * "0"
            if afterDotVal is not False:
                digits += str(afterDotVal)
            if digits:
                try:
                    result = float(str(result) + "." + digits)
                except ValueError:
                    # The pieces do not form a valid literal (e.g. the tail
                    # is itself a decimal): keep what was parsed so far.
                    pass
            break
        count += 1

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result or False
class PortugueseNormalizer(Normalizer):
    """Normalizer configured from the ``text/pt-pt/normalize.json`` resource."""

    # Loaded once, when the class body is executed. The file is read as
    # UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(resolve_resource_file("text/pt-pt/normalize.json"),
              encoding="utf-8") as f:
        _default_config = json.load(f)

    @staticmethod
    def tokenize(utterance):
        """Split an utterance into tokens.

        Digits are separated from a following "%", "#" from following
        digits, and hyphenated letter sequences are split around the
        hyphen. A lone trailing "-" token is dropped.

        Args:
            utterance (str): the text to split
        Returns:
            list: the tokens; empty when the utterance has no words
        """
        # Split things like 12%
        utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
        # Split things like #1
        utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance)
        # Split things like amo-te
        utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3",
                           utterance)
        tokens = utterance.split()
        # Guard against an empty token list: indexing tokens[-1] on an
        # empty or whitespace-only utterance raised IndexError.
        if tokens and tokens[-1] == '-':
            tokens = tokens[:-1]

        return tokens


def normalize_pt(text, remove_articles=True):
    """ PT string normalization

    Delegates to ``PortugueseNormalizer().normalize``.

    Args:
        text (str): the text to normalize
        remove_articles (bool): forwarded unchanged to ``normalize``
    Returns:
        whatever ``PortugueseNormalizer.normalize`` returns
    """
    return PortugueseNormalizer().normalize(text, remove_articles)
"anoitecer"], + "todos": ["ao", "aos"], + "em": ["do", "da", "dos", "das", "de"]} + for syn in synonims: + for word in synonims[syn]: + s = s.replace(" " + word + " ", " " + syn + " ") + # relevant plurals, cant just extract all s in pt + wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", + "minutos", "segundos", "nas", "nos", "proximas", + "seguintes", "horas"] + for _, word in enumerate(wordlist): + s = s.replace(word, word.rstrip('s')) + s = s.replace("meses", "mes").replace("anteriores", "anterior") + return s + + def date_found(): + return found or \ + ( + datestr != "" or timeStr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(text).split(" ") + timeQualifiersList = ['manha', 'tarde', 'noite'] + time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", + "no", "dia", "hora"] + days = ['segunda', 'terca', 'quarta', + 'quinta', 'sexta', 'sabado', 'domingo'] + months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', + 'julho', 'agosto', 'setembro', 'outubro', 'novembro', + 'dezembro'] + monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', + 'set', 'out', 'nov', 'dec'] + nexts = ["proximo", "proxima"] + suffix_nexts = ["seguinte", "subsequente", "seguir"] + lasts = ["ultimo", "ultima"] + suffix_lasts = ["passada", "passado", "anterior", "antes"] + nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] + prevs = ["antes", "ante", "previa", "previamente", "anterior"] + froms = ["partir", "em", "para", "na", "no", "daqui", 
"seguir", + "depois", "por", "proxima", "proximo", "da", "do", "de"] + thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif word == "hoje" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "amanha" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "ontem" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "anteontem" or + (word == "ante" and wordNext == "ontem")) and not fromFlag: + dayOffset -= 2 + used += 1 + if wordNext == "ontem": + used += 1 + elif word == "ante" and wordNext == "ante" and wordNextNext == \ + "ontem" and not fromFlag: + dayOffset -= 3 + used += 3 + elif word == "anteanteontem" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow + elif word == "depois" and wordNext == "amanha" and not fromFlag: + dayOffset += 2 + used = 2 + # day before yesterday + elif word == "antes" and wordNext == "ontem" and not fromFlag: + dayOffset -= 2 + used = 2 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "dia": + if wordNext == "depois" or wordNext == "antes": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif 
wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "semana" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "ano" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + for w in nexts: + if wordPrev == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in lasts: + if wordPrev == w: + dayOffset -= 7 + used += 1 + start -= 1 + for w in suffix_nexts: + if wordNext == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in suffix_lasts: + if wordNext == w: + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "feira": + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 maio + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # maio 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia maio + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # maio dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("hoje") + validFollowups.append("amanha") + validFollowups.append("ontem") + 
validFollowups.append("anteontem") + validFollowups.append("agora") + validFollowups.append("ja") + validFollowups.append("ante") + + # TODO debug word "depois" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == "amanha" and wordNext == "ontem") and not ( + word == "depois" or word == "antes" or word == "em"): + used = 2 + fromFlag = True + if wordNext == "amanha" and word != "depois": + dayOffset += 1 + elif wordNext == "ontem": + dayOffset -= 1 + elif wordNext == "anteontem": + dayOffset -= 2 + elif wordNext == "ante" and wordNextNext == "ontem": + dayOffset -= 2 + elif (wordNext == "ante" and wordNextNext == "ante" and + wordNextNextNext == "ontem"): + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if wordNextNext == "feira": + used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + if wordNextNextNext == "feira": + used += 1 + if wordNext in months: + used -= 1 + if used > 0: + + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 
else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "meio" and wordNext == "dia": + hrAbs = 12 + used += 2 + elif word == "meia" and wordNext == "noite": + hrAbs = 0 + used += 2 + elif word == "manha": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "tarde": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "meio" and wordNext == "tarde": + if not hrAbs: + hrAbs = 17 + used += 2 + elif word == "meio" and wordNext == "manha": + if not hrAbs: + hrAbs = 10 + used += 2 + elif word == "fim" and wordNext == "tarde": + if not hrAbs: + hrAbs = 19 + used += 2 + elif word == "fim" and wordNext == "manha": + if not hrAbs: + hrAbs = 11 + used += 2 + elif word == "tantas" and wordNext == "manha": + if not hrAbs: + hrAbs = 4 + used += 2 + elif word == "noite": + if not hrAbs: + hrAbs = 22 + used += 1 + # parse half an hour, quarter hour + elif word == "hora" and \ + (wordPrev in time_indicators or wordPrevPrev in + time_indicators): + if wordPrev == "meia": + minOffset = 30 + elif wordPrev == "quarto": + minOffset = 15 + elif wordPrevPrev == "quarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i 
-= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "manha": + remainder = "am" + used += 1 + elif wordNext == "tarde": + remainder = "pm" + used += 1 + elif wordNext == "noite": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == "manha": + remainder = "am" + used = 2 + elif wordNext in thises and wordNextNext == "tarde": + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "noite": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + military = True + if strHH <= 12 and \ + (timeQualifier == "manha" or + timeQualifier == "tarde"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarde"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." 
or + wordNext == "manha"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" or + wordPrev == "zero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuto": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundo": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "em" and wordNextNext == "ponto"): + strHH = word + strMM = 00 + if wordNext == "em" and wordNextNext == "ponto": + used += 2 + if wordNextNextNext == "tarde": + remainder = "pm" + used += 1 + elif wordNextNextNext == "manha": + remainder = "am" + used += 1 + elif wordNextNextNext == "noite": + if 0 > int(strHH) > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + military = True + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" 
+ + if wordPrev == "em" or wordPrev == "ponto": + words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + minute=temp.strftime("%M"), + second=temp.strftime("%S")) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) 
+ if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if (hrAbs or 0) != -1 and (minAbs or 0) != -1: + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + resultStr = _pt_pruning(resultStr) + return [extractedDate, resultStr] + + +def _pt_pruning(text, symbols=True, accents=True, agressive=True): + # agressive pt word pruning + words = ["a", "o", "os", "as", "de", "dos", "das", + "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", + "este", + "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa", "foi", "que"] + if symbols: + symbols = [".", ",", ";", ":", "!", "?", "�", "�"] + for symbol in symbols: + text = text.replace(symbol, "") + text = text.replace("-", " ").replace("_", " ") + if accents: + accents = {"a": ["á", "à", "ã", "â"], + "e": ["ê", "è", "é"], + "i": ["í", "ì"], + "o": ["ò", "ó"], + "u": ["ú", "ù"], + "c": ["ç"]} + for char in accents: + for acc in accents[char]: + text = text.replace(acc, char) + if agressive: + text_words = text.split(" ") + for idx, word in enumerate(text_words): + if word in words: + text_words[idx] = "" + text = " ".join(text_words) + text = ' '.join(text.split()) + return text + + +def get_gender_pt(word, context=""): + """ Guess the gender of a word + + Some languages assign genders to specific words. 
def get_gender_pt(word, context=""):
    """ Guess the grammatical gender of a Portuguese word

    The determinant directly preceding the word inside `context` is checked
    first; if that gives no answer, the word itself (and its form with
    trailing "s" removed) is looked up, and finally its ending is examined.

    Args:
        word (str): The word to look up
        context (str, optional): String containing word, for context

    Returns:
        str: "m" or "f" when the determinant or an ending decides it,
             otherwise whatever the gender table stores for the word,
             or None if nothing matches.
    """
    target = word.lower()
    context_words = context.lower().split(" ")

    # the determinant in front of the word takes precedence over everything
    for position in range(1, len(context_words)):
        if context_words[position] != target:
            continue
        determinant = context_words[position - 1]
        if determinant in _MALE_DETERMINANTS_PT:
            return "m"
        if determinant in _FEMALE_DETERMINANTS_PT:
            return "f"

    # explicit table lookup, first as given, then without trailing "s"
    if target in _GENDERS_PT:
        return _GENDERS_PT[target]
    without_s = target.rstrip("s")
    if without_s in _GENDERS_PT:
        return _GENDERS_PT[without_s]

    # fall back to the word ending; feminine endings are tested first
    if any(target.endswith(ending) for ending in _FEMALE_ENDINGS_PT):
        return "f"
    if any(target.endswith(ending) for ending in _MALE_ENDINGS_PT):
        return "m"
    return None
def generate_plurals_ru(originals):
    """
    Build inflected variants of the given words by appending a fixed list of
    Russian endings to each of them.

    Args:
        originals set(str) or dict(str, any): words to inflect; for a dict
            the keys are inflected and every variant keeps the key's value

    Returns:
        set(str) or dict(str, any): only the generated variants (the
        original words themselves are not included)

    """
    endings = ("а", "ах", "ам", "ами", "ные", "ный", "ов", "ом", "ы")

    if not isinstance(originals, dict):
        return {stem + ending for stem in originals for ending in endings}

    inflected = {}
    for stem, number in originals.items():
        for ending in endings:
            inflected[stem + ending] = number
    return inflected
def _convert_words_to_numbers_ru(text, short_scale=True, ordinals=False):
    """
    Replace every number written in words with its numeric form.

    Args:
        text str: the text to process (it is lower-cased first)
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third) should
                          be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The lower-cased text with each extracted number substituted for the
        words it was built from, tokens joined by single spaces.

    """
    tokens = tokenize(text.lower())
    pending = sorted(
        _extract_numbers_with_text_ru(tokens, short_scale, ordinals),
        key=lambda number: number.start_index)

    cursor = 0  # position of the next number still to be emitted
    pieces = []
    for token in tokens:
        if cursor >= len(pending):
            # no numbers left: copy the token through
            pieces.append(token.word)
            continue

        current = pending[cursor]
        if token.index < current.start_index:
            pieces.append(token.word)
            continue

        # from the first token of a number onwards: emit the value once,
        # swallow the remaining tokens, advance after the last one
        if token.index == current.start_index:
            pieces.append(str(current.value))
        if token.index == current.end_index:
            cursor += 1

    return ' '.join(pieces)
def _extract_number_with_text_ru(tokens, short_scale=True,
                                 ordinals=False, fractional_numbers=True):
    """
    Extract a single number from a list of Tokens.

    Args:
        tokens [Token]: the tokens to search
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): True if we should look for fractions and
                                   decimals.
    Returns:
        ReplaceableNumber: wraps the value found by the helper together with
        the tokens it was read from

    """
    value, used_tokens = _extract_number_with_text_ru_helper(
        tokens, short_scale, ordinals, fractional_numbers)
    return ReplaceableNumber(value, used_tokens)
def _extract_fraction_with_text_ru(tokens, short_scale, ordinals):
    """
    Extract a mixed number ("<whole> <marker> <fraction>") from the tokens.

    The tokens are split on each word of _FRACTION_MARKER in turn; the last
    number before the marker and the first number after it are combined when
    the former is >= 1 and the latter lies strictly between 0 and 1. Phrases
    without a marker are left to the whole number function.

    Args:
        tokens [Token]: words and their indexes in the original string.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (int or float, [Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.

    """
    # _FRACTION_MARKER is a set: iterate it in sorted order so the outcome
    # does not depend on string hash randomisation
    for marker in sorted(_FRACTION_MARKER):
        partitions = partition_list(tokens,
                                    lambda t, marker=marker: t.word == marker)

        if len(partitions) != 3:
            continue

        numbers1 = \
            _extract_numbers_with_text_ru(partitions[0], short_scale,
                                          ordinals, fractional_numbers=False)
        numbers2 = \
            _extract_numbers_with_text_ru(partitions[2], short_scale,
                                          ordinals, fractional_numbers=True)

        if not numbers1 or not numbers2:
            # this marker does not separate two numbers; another one may
            continue

        # ensure first is not a fraction and second is a fraction
        num1 = numbers1[-1]
        num2 = numbers2[0]
        if num1.value >= 1 and 0 < num2.value < 1:
            return num1.value + num2.value, \
                num1.tokens + partitions[1] + num2.tokens

    return None, None
def _extract_whole_number_with_text_ru(tokens, short_scale, ordinals):
    """
    Handle numbers not handled by the decimal or fraction functions. This is
    generally whole numbers; single-word spoken fractions and "a/b" tokens
    are also resolved here.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:

    Returns:
        int or float, [Tokens]
        The value parsed, and tokens that it corresponds to. The value is
        falsy when no number was found.

    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data(short_scale)

    number_words = []  # type: [Token]
    val = False
    prev_val = None
    next_val = None
    to_sum = []
    for idx, token in enumerate(tokens):
        current_val = None
        if next_val:
            # this token was already consumed by the previous iteration as
            # the fraction word following a number ("2 fifths")
            next_val = None
            continue

        word = token.word
        if word in _NEGATIVES:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""

        # Russian marks explicit ordinals with a trailing point ("1.", "2.")
        # rather than a suffix such as "1st"
        if is_numeric(word[:-1]) and word.endswith("."):
            word = word[:-1]

        # Normalize Russian inflection of numbers (один, одна, одно,...)
        if not ordinals:
            word = _text_ru_inflection_normalize(word, 1)

        if word not in string_num_scale and \
                word not in _STRING_NUM_RU and \
                word not in _SUMS and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not is_fractional_ru(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # not a number word: stop if a number was being collected,
            # otherwise forget any pending negation and keep scanning
            words_only = [t.word for t in number_words]
            if number_words and not all(w in _NEGATIVES for w in words_only):
                break
            number_words = []
            continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES:
            number_words = [token]
        elif prev_word in _SUMS and word in _SUMS:
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_RU:
            val = _STRING_NUM_RU.get(word)
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale.get(word)
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if (prev_word in _SUMS and val and val < 10) \
                or (prev_word in _SUMS and val and val < 100
                    and prev_val >= 100) \
                or all([prev_word in multiplies,
                        val < prev_val if prev_val else False]):
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = is_fractional_ru(word, short_scale=short_scale)
            current_val = val

        # 2 fifths
        if not ordinals:
            next_val = is_fractional_ru(next_word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            a_pieces = word.split('/')
            if look_for_fractions(a_pieces):
                val = float(a_pieces[0]) / float(a_pieces[1])
        else:
            # current_val stays None when val was only derived from the
            # following fraction word; it must not be compared in that case
            if prev_word in _SUMS \
                    and word not in _SUMS \
                    and word not in multiplies \
                    and current_val is not None \
                    and current_val >= 10:
                # Backtrack - we've got numbers we can't sum.
                number_words.pop()
                val = prev_val
                break
            prev_val = val

            if word in multiplies and next_word not in multiplies:
                # Handle long numbers such as
                # "two million five hundred thousand".
                #
                # The current word is a scale word; `current_val` is its own
                # value and `val` the running value built so far. If every
                # scale word still ahead is smaller than `current_val`, the
                # running value is complete for this "place": it is moved to
                # `to_sum` and accumulation restarts from zero. All parts
                # are added together after the loop, e.g.
                #   9000000 + 907000 + 600 + 57 == 9907657
                time_to_sum = True
                for other_token in tokens[idx + 1:]:
                    if other_token.word in multiplies:
                        if string_num_scale[other_token.word] >= current_val:
                            time_to_sum = False
                        else:
                            continue
                    if not time_to_sum:
                        break
                if time_to_sum:
                    to_sum.append(val)
                    val = 0
                    prev_val = 0

    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words
def extract_duration_ru(text):
    """
    Convert a Russian phrase into a duration.

    Number words are first converted to digits, then every
    "<number> <unit>" pair whose unit is a key of _TIME_UNITS_CONVERSION is
    removed from the text and added to the matching timedelta field.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
        None is returned instead of a tuple when `text` is empty.
    """
    if not text:
        return None

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # The group must be written "(?P<number>...)": a "(?P" without a name is
    # rejected by the re module. The optional tail covers common Russian
    # endings of the unit word.
    pattern = r"(?P<number>\d+(?:\.?\d+)?)(?:\s+|\-){unit}(?:а|ов|у|ут|уту)?"
    text = _convert_words_to_numbers_ru(text)

    for unit_ru, unit_en in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_ru)

        # bind unit_en per iteration so the callback never sees a later value
        def repl(match, unit_en=unit_en):
            time_units[unit_en] += float(match.group("number"))
            return ''

        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return duration, text
+ """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + # Normalize Russian inflection + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("сегодня вечером", "вечером") \ + .replace("сегодня ночью", "ночью") + word_list = s.split() + + for idx, word in enumerate(word_list): + # word = word.replace("'s", "") + ########## + # Russian Day Ordinals - we do not use 1st,2nd format + # instead we use full ordinal number names with specific format(suffix) + # Example: тридцать первого > 31 + count_ordinals = 0 + if word == "первого": + count_ordinals = 1 # These two have different format + elif word == "третьего": + count_ordinals = 3 + elif word.endswith("ого"): + tmp = word[:-3] + tmp += "ый" + for nr, name in _ORDINAL_BASE_RU.items(): + if name == tmp: + count_ordinals = nr + + # If number is bigger than 19 check if next word is also ordinal + # and count them together + if count_ordinals > 19: + if word_list[idx + 1] == "первого": + count_ordinals += 1 # These two have different format + elif word_list[idx + 1] == "третьего": + count_ordinals += 3 + elif word_list[idx + 1].endswith("ого"): + tmp = word_list[idx + 1][:-3] + tmp += "ый" + for nr, name in _ORDINAL_BASE_RU.items(): + if name == tmp and nr < 10: + # write only if sum makes acceptable count of days in month + if (count_ordinals + nr) <= 31: + count_ordinals += nr + + if count_ordinals > 0: + word = str(count_ordinals) # Write normalized value into word + if count_ordinals > 20: + # If counted number is greater than 20, clear next word so it is not used again + word_list[idx + 1] = "" + ########## + # Remove inflection from Russian months + + word_list[idx] = word + + return word_list + + def date_found(): + return found or \ + ( + date_string != "" or + year_offset != 0 or month_offset != 0 or + day_offset is True or hr_offset != 0 or + hr_abs or min_offset != 0 or + min_abs or sec_offset != 0 + ) + + if text == "": + return None + + 
anchor_date = anchor_date or now_local() + found = False + day_specified = False + day_offset = False + month_offset = 0 + year_offset = 0 + today = anchor_date.strftime("%w") + current_year = anchor_date.strftime("%Y") + from_flag = False + date_string = "" + has_year = False + time_qualifier = "" + + time_qualifiers_am = _WORDS_MORNING_RU + time_qualifiers_pm = ['дня', 'вечера'] + time_qualifiers_pm.extend(_WORDS_DAY_RU) + time_qualifiers_pm.extend(_WORDS_EVENING_RU) + time_qualifiers_pm.extend(_WORDS_NIGHT_RU) + time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) + markers = ['на', 'в', 'во', 'до', 'на', 'это', + 'около', 'этот', 'через', 'спустя', 'за', 'тот'] + days = ['понедельник', 'вторник', 'среда', + 'четверг', 'пятница', 'суббота', 'воскресенье'] + months = _MONTHS_RU + recur_markers = days + ['выходные', 'викенд'] + months_short = ['янв', 'фев', 'мар', 'апр', 'май', 'июн', 'июл', 'авг', + 'сен', 'окт', 'ноя', 'дек'] + year_multiples = ["десятилетие", "век", "тысячелетие"] + + words = clean_string(text) + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + + word = _text_ru_inflection_normalize(word, 2) + word_prev_prev = _text_ru_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_ru_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_ru_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = _text_ru_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + if word in _WORDS_NOW_RU and not date_string: + result_str = " ".join(words[idx + 1:]) + result_str = ' '.join(result_str.split()) + extracted_date = anchor_date.replace(microsecond=0) + return [extracted_date, result_str] + elif word_next in year_multiples: + multiplier = None + if 
is_numeric(word): + multiplier = extract_number_ru(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if word_next == "десятилетие": + year_offset = multiplier * 10 + elif word_next == "век": + year_offset = multiplier * 100 + elif word_next == "тысячелетие": + year_offset = multiplier * 1000 + elif word in time_qualifiers_list and preposition != "через" and word_next != "назад": + time_qualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "сегодня" and not from_flag: + day_offset = 0 + used += 1 + elif word == "завтра" and not from_flag: + day_offset = 1 + used += 1 + elif word == "послезавтра" and not from_flag: + day_offset = 2 + used += 1 + elif word == "после" and word_next == "завтра" and not from_flag: + day_offset = 2 + used += 2 + elif word == "позавчера" and not from_flag: + day_offset = -2 + used += 1 + elif word == "вчера" and not from_flag: + day_offset = -1 + used += 1 + elif (word in ["день", "дня"] and + word_next == "после" and + word_next_next == "завтра" and + not from_flag and + (not word_prev or not word_prev[0].isdigit())): + day_offset = 2 + used = 2 + elif word in ["день", "дня"] and is_numeric(word_prev) and preposition == "через": + if word_prev and word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + elif word in ["день", "дня"] and is_numeric(word_prev) and word_next == "назад": + if word_prev and word_prev[0].isdigit(): + day_offset += -int(word_prev) + start -= 1 + used = 3 + elif word == "сегодня" and not from_flag and word_prev: + if word_prev[0].isdigit(): + day_offset += int(word_prev) * 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + day_offset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "неделя" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + day_offset = int(word_prev) * 7 + 
start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + day_offset = -7 + start -= 1 + used = 2 + elif word == "месяц" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + month_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + month_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + month_offset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "год" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + year_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + year_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + year_offset = -1 + start -= 1 + used = 2 + elif word_prev == "через": + year_offset = 1 + used = 1 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not from_flag: + d = days.index(word) + day_offset = (d + 1) - int(today) + used = 1 + if day_offset < 0: + day_offset += 7 + if word_prev in _WORDS_NEXT_RU: + if day_offset <= 2: + day_offset += 7 + used += 1 + start -= 1 + elif word_prev in _WORDS_PREV_RU: + day_offset -= 7 + used += 1 + start -= 1 + elif word in months or word in months_short and not from_flag: + try: + m = months.index(word) + except ValueError: + m = months_short.index(word) + used += 1 + # Convert Russian months to english + date_string = _MONTHS_CONVERSION.get(m) + if word_prev and (word_prev[0].isdigit() or + (word_prev == " " and word_prev_prev[0].isdigit())): + if word_prev == " " and word_prev_prev[0].isdigit(): + date_string += " " + words[idx - 2] + used += 1 + start -= 1 + else: + date_string += " " + word_prev + start -= 1 + used += 1 + if word_next and word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + has_year = True + else: + has_year = False + + elif word_next and 
word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + if word_next_next and word_next_next[0].isdigit(): + date_string += " " + word_next_next + used += 1 + has_year = True + else: + has_year = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + valid_followups = days + months + months_short + valid_followups.append("сегодня") + valid_followups.append("завтра") + valid_followups.append("послезавтра") + valid_followups.append("вчера") + valid_followups.append("позавчера") + for followup in _WORDS_NEXT_RU: + valid_followups.append(followup) + for followup in _WORDS_PREV_RU: + valid_followups.append(followup) + for followup in _WORDS_CURRENT_RU: + valid_followups.append(followup) + for followup in _WORDS_NOW_RU: + valid_followups.append(followup) + if (word in ["до", "по", "от", "с", "со"]) and word_next in valid_followups: + used = 2 + from_flag = True + if word_next == "завтра": + day_offset += 1 + elif word_next == "послезавтра": + day_offset += 2 + elif word_next == "вчера": + day_offset -= 1 + elif word_next == "позавчера": + day_offset -= 2 + elif word_next in days: + d = days.index(word_next) + tmp_offset = (d + 1) - int(today) + used = 2 + if tmp_offset < 0: + tmp_offset += 7 + day_offset += tmp_offset + elif word_next_next and word_next_next in days: + d = days.index(word_next_next) + tmp_offset = (d + 1) - int(today) + used = 3 + if word_next in _WORDS_NEXT_RU: + if day_offset <= 2: + tmp_offset += 7 + used += 1 + start -= 1 + elif word_next in _WORDS_PREV_RU: + tmp_offset -= 7 + used += 1 + start -= 1 + day_offset += tmp_offset + if used > 0: + if start - 1 > 0 and (words[start - 1] in _WORDS_CURRENT_RU): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + day_specified = True + + # parse time + hr_offset = 0 + min_offset = 0 + sec_offset = 0 + hr_abs = None + min_abs = None + 
military = False + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + + word = _text_ru_inflection_normalize(word, 2) + word_prev_prev = _text_ru_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_ru_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_ru_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = _text_ru_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "полдень": + hr_abs = 12 + used += 1 + elif word == "полночь": + hr_abs = 0 + used += 1 + elif word in _WORDS_MORNING_RU: + if hr_abs is None: + hr_abs = 8 + used += 1 + elif word in _WORDS_DAY_RU: + if hr_abs is None: + hr_abs = 15 + used += 1 + elif word in _WORDS_EVENING_RU: + if hr_abs is None: + hr_abs = 19 + used += 1 + if word_next != "" and word_next[0].isdigit() and ":" in word_next: + used -= 1 + elif word in _WORDS_NIGHT_RU: + if hr_abs is None: + hr_abs = 22 + # parse half an hour, quarter hour + elif word == "час" and \ + (word_prev in markers or word_prev_prev in markers): + if word_prev in ["пол", "половина"]: + min_offset = 30 + elif word_prev == "четверть": + min_offset = 15 + elif word_prev == "через": + hr_offset = 1 + else: + hr_offset = 1 + if word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_RU: + day_specified = True + words[idx - 1] = "" + used += 1 + hr_abs = -1 + min_abs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "минута" and word_prev == "через": + min_offset = 1 + words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "секунда" and word_prev == "через": + sec_offset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + is_time = True + str_hh = "" + str_mm = "" + remainder = "" + word_next_next_next = 
words[idx + 3] \ + if idx + 3 < len(words) else "" + if word_next in _WORDS_EVENING_RU or word_next in _WORDS_NIGHT_RU or word_next_next in _WORDS_EVENING_RU \ + or word_next_next in _WORDS_NIGHT_RU or word_prev in _WORDS_EVENING_RU \ + or word_prev in _WORDS_NIGHT_RU or word_prev_prev in _WORDS_EVENING_RU \ + or word_prev_prev in _WORDS_NIGHT_RU or word_next_next_next in _WORDS_EVENING_RU \ + or word_next_next_next in _WORDS_NIGHT_RU: + remainder = "pm" + used += 1 + if word_prev in _WORDS_EVENING_RU or word_prev in _WORDS_NIGHT_RU: + words[idx - 1] = "" + if word_prev_prev in _WORDS_EVENING_RU or word_prev_prev in _WORDS_NIGHT_RU: + words[idx - 2] = "" + if word_next_next in _WORDS_EVENING_RU or word_next_next in _WORDS_NIGHT_RU: + used += 1 + if word_next_next_next in _WORDS_EVENING_RU or word_next_next_next in _WORDS_NIGHT_RU: + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + str_hh += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + str_mm += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + next_word = word_next.replace(".", "") + if next_word in ["am", "pm", "ночи", "утра", "дня", "вечера"]: + remainder = next_word + used += 1 + elif next_word == "часа" and word_next_next in ["am", "pm", "ночи", "утра", "дня", "вечера"]: + remainder = word_next_next + used += 2 + elif word_next in _WORDS_MORNING_RU: + remainder = "am" + used += 2 + elif word_next in _WORDS_DAY_RU: + remainder = "pm" + used += 2 + elif word_next in _WORDS_EVENING_RU: + remainder = "pm" + used += 2 + elif word_next == "этого" and word_next_next in _WORDS_MORNING_RU: + remainder = "am" + used = 2 + day_specified = True + elif word_next == "на" and word_next_next in _WORDS_DAY_RU: + remainder = "pm" + used = 2 + day_specified = 
True + elif word_next == "на" and word_next_next in _WORDS_EVENING_RU: + remainder = "pm" + used = 2 + day_specified = True + elif word_next == "в" and word_next_next in _WORDS_NIGHT_RU: + if str_hh and int(str_hh) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + elif hr_abs and hr_abs != -1: + if hr_abs >= 12: + remainder = "pm" + else: + remainder = "am" + used += 1 + else: + if time_qualifier != "": + military = True + if str_hh and int(str_hh) <= 12 and \ + (time_qualifier in time_qualifiers_pm): + str_hh += str(int(str_hh) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + str_num = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + str_num += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = word_next.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечера" or + word_next == "вечера"): + str_hh = str_num + remainder = "pm" + used = 1 + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечера" or + word_next == "вечера"): + str_hh = str_num + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + word_next == "am" or + remainder == "a.m." or + word_next == "a.m." 
or + remainder == "ночи" or + word_next == "ночи" or + remainder == "утра" or + word_next == "утра"): + str_hh = str_num + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + word_next in recur_markers or + word_next_next in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set str_hh so that is_time == True + # when am or pm is not specified + str_hh = str_num + used = 1 + else: + if int(str_num) > 100: + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "час": + used += 1 + elif ( + (word_next == "час" or + remainder == "час") and + word[0] != '0' and + # (wordPrev != "в" and wordPrev != "на") + word_prev == "через" + and + ( + int(str_num) < 100 or + int(str_num) > 2400 + )): + # ignores military time + # "in 3 hours" + hr_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "минута" or \ + remainder == "минута": + # "in 10 minutes" + min_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "секунда" \ + or remainder == "секунда": + # in 5 seconds + sec_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif int(str_num) > 100: + # military time, eg. "3300 hours" + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "час" or \ + remainder == "час": + used += 1 + elif word_next and word_next[0].isdigit(): + # military time, e.g. 
"04 38 hours" + str_hh = str_num + str_mm = word_next + military = True + used += 1 + if (word_next_next == "час" or + remainder == "час"): + used += 1 + elif ( + word_next == "" or word_next == "час" or + ( + (word_next == "в" or word_next == "на") and + ( + word_next_next == time_qualifier + ) + ) or word_next in _WORDS_EVENING_RU or + word_next_next in _WORDS_EVENING_RU): + + str_hh = str_num + str_mm = "00" + if word_next == "час": + used += 1 + if (word_next == "в" or word_next == "на" + or word_next_next == "в" or word_next_next == "на"): + used += (1 if (word_next == + "в" or word_next == "на") else 2) + word_next_next_next = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (word_next_next and + (word_next_next in time_qualifier or + word_next_next_next in time_qualifier)): + if (word_next_next in time_qualifiers_pm or + word_next_next_next in time_qualifiers_pm): + remainder = "pm" + used += 1 + if (word_next_next in time_qualifiers_am or + word_next_next_next in time_qualifiers_am): + remainder = "am" + used += 1 + + if time_qualifier != "": + if time_qualifier in time_qualifiers_pm: + remainder = "pm" + used += 1 + + elif time_qualifier in time_qualifiers_am: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + elif remainder == "час": + if word_next_next in ["ночи", "утра"]: + remainder = "am" + used += 1 + elif word_next_next in ["дня", "вечера"]: + remainder = "pm" + used += 1 + else: + remainder = "" + + else: + is_time = False + hh = int(str_hh) if str_hh else 0 + mm = int(str_mm) if str_mm else 0 + hh = hh + 12 if remainder == "pm" and hh < 12 else hh + hh = hh - 12 if remainder == "am" and hh >= 12 else hh + if (not military and + remainder not in ['am', 'pm', 'час', 'минута', 'секунда'] and + ((not day_specified) or 0 <= day_offset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if 
anchor_date.hour < hh or (anchor_date.hour == hh and + anchor_date.minute < mm): + pass # No modification needed + elif anchor_date.hour < hh + 12: + hh += 12 + else: + # has passed, assume the next morning + day_offset += 1 + if time_qualifier in time_qualifiers_pm and hh < 12: + hh += 12 + + if hh > 24 or mm > 59: + is_time = False + used = 0 + if is_time: + hr_abs = hh + min_abs = mm + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + # if wordPrev == "o" or wordPrev == "oh": + # words[words.index(wordPrev)] = "" + + if word_prev == "скоро": + hr_offset = -1 + words[idx - 1] = "" + idx -= 1 + elif word_prev == "позже": + hr_offset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and word_prev in markers: + words[idx - 1] = "" + if word_prev in _WORDS_CURRENT_RU: + day_specified = True + if idx > 1 and word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_RU: + day_specified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if day_offset is False: + day_offset = 0 + + # perform date manipulation + + extracted_date = anchor_date.replace(microsecond=0) + if date_string != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(date_string, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(date_string, "%B %d %Y") + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + if not has_year: + temp = temp.replace(year=extracted_date.year, + tzinfo=extracted_date.tzinfo) + if extracted_date < temp: + extracted_date = extracted_date.replace( + year=int(current_year), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(current_year) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hr_offset == 0 and min_offset == 0 and sec_offset == 0: + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + + if year_offset != 0: + extracted_date = extracted_date + relativedelta(years=year_offset) + if month_offset != 0: + extracted_date = extracted_date + relativedelta(months=month_offset) + if day_offset != 0: + extracted_date = extracted_date + relativedelta(days=day_offset) + if hr_abs != -1 and min_abs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hr_abs is None and min_abs is None and default_time is not None: + hr_abs, min_abs = default_time.hour, default_time.minute + else: + hr_abs = hr_abs or 0 + min_abs = min_abs or 0 + + extracted_date = extracted_date + relativedelta(hours=hr_abs, + minutes=min_abs) + if (hr_abs != 0 or min_abs != 0) and date_string == "": + if not day_specified and anchor_date > extracted_date: + extracted_date = extracted_date + relativedelta(days=1) + if hr_offset != 0: + 
def is_fractional_ru(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
                            (not used by this implementation)
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    word = input_str.lower()

    # A numerator bigger than one puts the fraction word into a plural form
    # (две четвёртые, три пятых). Swap that ending for "тая", keeping the
    # stem, so the word can be looked up below (presumably the table holds
    # the singular forms - confirm against _FRACTION_STRING_RU).
    if word.endswith(("тые", "тых")):
        word = word[:-3] + "тая"

    fractions = {"целая": 1}  # first four numbers have little different format

    # Numbers from 2 to 1 hundred, more is not usually used in common speech
    for num, name in _FRACTION_STRING_RU.items():
        if num > 1:
            fractions[name] = num

    if word in fractions:
        return 1.0 / fractions[word]
    return False


def extract_numbers_ru(text, short_scale=True, ordinals=False):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    results = _extract_numbers_with_text_ru(tokenize(text),
                                            short_scale, ordinals)
    return [float(result.value) for result in results]


class RussianNormalizer(Normalizer):
    # The default configuration is loaded from the bundled JSON resource
    # once, while the class body is executed.
    with open(resolve_resource_file("text/ru-ru/normalize.json"), encoding='utf8') as f:
        _default_config = json.load(f)


def normalize_ru(text, remove_articles=True):
    """ Russian string normalization, delegated to RussianNormalizer """
    return RussianNormalizer().normalize(text, remove_articles)
return "пятница" + if word in ["субботу", "субботы"]: + return "суббота" + + # Months + if word in ["марта", "марте"]: + return "март" + if word in ["мае", "мая"]: + return "май" + if word in ["августа", "августе"]: + return "август" + + if word[-2:] in ["ле", "ля", "не", "ня", "ре", "ря"]: + tmp = word[:-1] + "ь" + for name in _MONTHS_RU: + if name == tmp: + return name + + return word diff --git a/lingua_franca/lang/parse_sl.py b/lingua_franca/lang/parse_sl.py new file mode 100644 index 0000000..c5e26ab --- /dev/null +++ b/lingua_franca/lang/parse_sl.py @@ -0,0 +1 @@ +# TODO implement parsing function diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py new file mode 100644 index 0000000..0216411 --- /dev/null +++ b/lingua_franca/lang/parse_sv.py @@ -0,0 +1,922 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta + +from lingua_franca.time import now_local + +from .parse_common import (is_numeric, look_for_fractions, Normalizer, + tokenize, Token) + + +def _find_numbers_in_text(tokens): + """Finds duration related numbers in texts and makes a list of mappings. + + The mapping will be for number to token that created it, if no number was + created from the token the mapping will be from None to the token. 
def _combine_adjacent_numbers(number_map):
    """Combine adjacent numbers through multiplication.

    Walks through a number map and joins adjacent numbers to handle cases
    such as "en halvtimme" (one half hour).

    Args:
        number_map (list): (number, token) tuples; number is None (or
            otherwise falsy) for tokens that carry no number

    Returns:
        (list): simplified number_map of (number, tokens) tuples, where
                tokens is a tuple of the one or two tokens behind the entry.
                An empty map gives an empty list.
    """
    simplified = []
    last = len(number_map) - 1
    idx = 0
    while idx <= last:
        number, token = number_map[idx]
        if idx < last and number and number_map[idx + 1][0]:
            # Two numbers in a row: multiply them and consume both entries.
            next_number, next_token = number_map[idx + 1]
            simplified.append((number * next_number, (token, next_token)))
            idx += 2
        else:
            simplified.append((number, (token,)))
            idx += 1
    return simplified


def extract_duration_sv(text):
    """
    Convert a Swedish phrase into a duration.

    The function handles durations from seconds up to days.

    The words used in the duration will be consumed, and
    the remainder returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing, or None if no duration
                    is found.
    """
    tokens = tokenize(text)
    number_tok_map = _find_numbers_in_text(tokens)
    # Combine adjacent numbers
    simplified = _combine_adjacent_numbers(number_tok_map)

    states = {
        'days': 0,
        'hours': 0,
        'minutes': 0,
        'seconds': 0
    }

    # Parser state, mapping words that should set the parser to collect
    # numbers to a specific time "size"
    state_words = {
        'days': ('dygn', 'dag', 'dagar', 'dags'),
        'hours': ('timmar', 'timme', 'timma', 'timmes', 'timmas'),
        'minutes': ('minuter', 'minuters', 'minut', 'minuts'),
        'seconds': ('sekunder', 'sekunders', 'sekund', 'sekunds')
    }
    # One-element tuple: a bare ('och') is just a string and would turn the
    # membership test below into a substring test.
    binding_words = ('och',)

    consumed = []
    state = None
    valid = False

    for num, toks in simplified:
        if state and num:
            states[state] += num
            consumed.extend(toks)
            valid = True  # If a state field got set this is valid duration
        elif num is None:
            for s in state_words:
                if toks[0].word in state_words[s]:
                    state = s
                    consumed.extend(toks)
                    break
            else:
                # Not a unit word: only a binding word keeps the current
                # unit active for the numbers that follow.
                if toks[0].word not in binding_words:
                    state = None

    td = timedelta(**states)
    remainder = ' '.join([t.word for t in tokens if t not in consumed])
    return (td, remainder) if valid else None


def extract_number_sv(text, short_scale=True, ordinals=False):
    """
    Extract a number from a Swedish text string.

    Understands digits, the ordinals "första".."sjätte", the cardinals
    "en"/"ett".."tio", fraction words, "a/b" fractions and two values
    joined by "och", which are added together.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): unused, see the TODO below
        ordinals (bool): unused, see the TODO below
    Returns:
        (int) or (float) or False: The value of the extracted number,
                                   False if none was found
    """
    # TODO: short_scale and ordinals don't do anything here.
    # The parameters are present in the function signature for API
    # compatibility reasons.
    ordinal_values = {
        "första": 1, "andra": 2, "tredje": 3,
        "fjärde": 4, "femte": 5, "sjätte": 6,
    }
    cardinal_values = {
        "en": 1, "ett": 1, "två": 2, "tre": 3, "fyra": 4, "fem": 5,
        "sex": 6, "sju": 7, "åtta": 8, "nio": 9, "tio": 10,
    }

    text = text.lower()
    aWords = text.split()
    and_pass = False
    valPreAnd = False
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            val = float(word)
        elif word in ordinal_values:
            val = ordinal_values[word]
        elif is_fractional_sv(word):
            val = is_fractional_sv(word)
        else:
            if word in cardinal_values:
                val = cardinal_values[word]
            if val:
                # A fraction word right after the number scales it.
                if count < (len(aWords) - 1):
                    wordNext = aWords[count + 1]
                else:
                    wordNext = ""
                valNext = is_fractional_sv(wordNext)

                if valNext:
                    val = val * valNext
                    aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # added to value, quit here
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "och"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'och':
            # Remember the first value and look for the second one after
            # the "och".
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'och':
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    return val or False
+ """ + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace(' den ', ' ').replace(' en ', ' ') + wordList = s.split() + for idx, word in enumerate(wordList): + word = word.replace("'s", "") + + ordinals = ["rd", "st", "nd", "th"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or timeStr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersList = ['morgon', 'förmiddag', 'eftermiddag', 'kväll'] + markers = ['på', 'i', 'den här', 'kring', 'efter'] + days = ['måndag', 'tisdag', 'onsdag', 'torsdag', + 'fredag', 'lördag', 'söndag'] + months = ['januari', 'februari', 'mars', 'april', 'maj', 'juni', + 'juli', 'augusti', 'september', 'oktober', 'november', + 'december'] + monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', + 'sept', 'oct', 'nov', 'dec'] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + word = word.rstrip('s') + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after 
tomorrow + elif word == "idag" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "imorgon" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "morgondagen" or word == "morgondagens" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "övermorgon" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "dag" or word == "dagar": + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "vecka" or word == "veckor" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "nästa": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "förra": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "månad" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "nästa": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "förra": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "år" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "nästa": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "förra": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "nästa": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "förra": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == "of" and wordPrevPrev[0].isdigit())): + if wordPrev == "of" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("idag") + validFollowups.append("imorgon") + validFollowups.append("nästa") + validFollowups.append("förra") + validFollowups.append("nu") + if (word == "från" or word == "efter") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "imorgon": + dayOffset += 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "nästa": + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "förra": + tmpOffset -= 7 + used += 1 + start -= 1 + 
dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "denna": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "middag": + hrAbs = 12 + used += 1 + elif word == "midnatt": + hrAbs = 0 + used += 1 + elif word == "morgon": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "förmiddag": + if not hrAbs: + hrAbs = 10 + used += 1 + elif word == "eftermiddag": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "kväll": + if not hrAbs: + hrAbs = 19 + used += 1 + # parse half an hour, quarter hour + elif wordPrev in markers or wordPrevPrev in markers: + if word == "halvtimme" or word == "halvtimma": + minOffset = 30 + elif word == "kvart": + minOffset = 15 + elif word == "timme" or word == "timma": + hrOffset = 1 + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = 
wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif nextWord == "tonight": + remainder = "pm" + used += 1 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "morning": + remainder = "am" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "afternoon": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "evening": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "morning": + remainder = "am" + used += 2 + elif wordNext == "in" and wordNextNext == "afternoon": + remainder = "pm" + used += 2 + elif wordNext == "in" and wordNextNext == "evening": + remainder = "pm" + used += 2 + elif wordNext == "this" and wordNextNext == "morning": + remainder = "am" + used = 2 + elif wordNext == "this" and wordNextNext == "afternoon": + remainder = "pm" + used = 2 + elif wordNext == "this" and wordNextNext == "evening": + remainder = "pm" + used = 2 + elif wordNext == "at" and wordNextNext == "night": + if strHH > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + else: + if timeQualifier != "": + if strHH <= 12 and \ + (timeQualifier == "evening" or + timeQualifier == "afternoon"): + strHH += 12 + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if wordNext == "pm" or wordNext == "p.m.": + strHH = strNum + remainder = "pm" + used = 1 + elif wordNext == "am" or wordNext == "a.m.": + strHH = strNum + remainder = "am" + used = 1 + elif ( + int(word) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hours": + used += 1 + elif ( + wordNext == "hours" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minutes": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "seconds": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hours": + used += 1 + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + used += 1 + if wordNextNext == "hours": + used += 1 + elif ( + wordNext == "" or wordNext == "o'clock" or + ( + wordNext == "in" and + ( + wordNextNext == "the" or + wordNextNext == timeQualifier + ) + )): + strHH = word + strMM = 00 + if wordNext == "o'clock": + used += 1 + if wordNext == "in" or wordNextNext == "in": + used += (1 if wordNext == "in" else 2) + if (wordNextNext and + wordNextNext in timeQualifier or + (words[words.index(wordNextNext) + 1] and + words[words.index(wordNextNext) + 1] in + timeQualifier)): + if (wordNextNext == "afternoon" or + (len(words) > + words.index(wordNextNext) + 1 and + words[words.index( + wordNextNext) + 1] == "afternoon")): + remainder = "pm" + if (wordNextNext == "evening" or + (len(words) > + (words.index(wordNextNext) + 1) and + words[words.index( + wordNextNext) + 1] == "evening")): + remainder = "pm" + 
if (wordNextNext == "morning" or + (len(words) > + words.index(wordNextNext) + 1 and + words[words.index( + wordNextNext) + 1] == "morning")): + remainder = "am" + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH + strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "o" or wordPrev == "oh": + words[words.index(wordPrev)] = "" + + if wordPrev == "early": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "late": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + temp = datetime.strptime(datestr, "%B %d") + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + 
minute=temp.strftime("%M"), + second=temp.strftime("%S")) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + if hrAbs != -1 and minAbs != -1: + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "and" and words[idx - 1] == "" and words[ + idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def is_fractional_sv(input_str, short_scale=True): + """ + This function takes the given text and checks if it is a fraction. + + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + + """ + if input_str.endswith('ars', -3): + input_str = input_str[:len(input_str) - 3] # e.g. "femtedelar" + if input_str.endswith('ar', -2): + input_str = input_str[:len(input_str) - 2] # e.g. "femtedelar" + if input_str.endswith('a', -1): + input_str = input_str[:len(input_str) - 1] # e.g. "halva" + if input_str.endswith('s', -1): + input_str = input_str[:len(input_str) - 1] # e.g. 
"halva" + + aFrac = ["hel", "halv", "tredjedel", "fjärdedel", "femtedel", "sjättedel", + "sjundedel", "åttondel", "niondel", "tiondel", "elftedel", + "tolftedel"] + if input_str.lower() in aFrac: + return 1.0 / (aFrac.index(input_str) + 1) + if input_str == "kvart": + return 1.0 / 4 + if input_str == "trekvart": + return 3.0 / 4 + + return False + + +def normalize_sv(text, remove_articles=True): + """ English string normalization """ + + words = text.split() # this also removed extra spaces + normalized = '' + for word in words: + # Convert numbers into digits, e.g. "two" -> "2" + if word == 'en': + word = 'ett' + textNumbers = ["noll", "ett", "två", "tre", "fyra", "fem", "sex", + "sju", "åtta", "nio", "tio", "elva", "tolv", + "tretton", "fjorton", "femton", "sexton", + "sjutton", "arton", "nitton", "tjugo"] + if word in textNumbers: + word = str(textNumbers.index(word)) + + normalized += " " + word + + return normalized[1:] # strip the initial space + + +class SwedishNormalizer(Normalizer): + """ TODO implement language specific normalizer""" diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py new file mode 100644 index 0000000..698c42b --- /dev/null +++ b/lingua_franca/parse.py @@ -0,0 +1,269 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from difflib import SequenceMatcher +from warnings import warn +from lingua_franca.time import now_local +from lingua_franca.internal import populate_localized_function_dict, \ + get_active_langs, get_full_lang_code, get_primary_lang_code, \ + get_default_lang, localized_function, _raise_unsupported_language + +_REGISTERED_FUNCTIONS = ("extract_numbers", + "extract_number", + "extract_duration", + "extract_datetime", + "normalize", + "get_gender", + "is_fractional", + "is_ordinal") + +populate_localized_function_dict("parse", langs=get_active_langs()) + + +def fuzzy_match(x: str, against: str) -> float: + """Perform a 'fuzzy' comparison between two strings. + + Returns: + match percentage -- 1.0 for perfect match, + down to 0.0 for no match at all. + """ + return SequenceMatcher(None, x, against).ratio() + + +def match_one(query, choices): + """ + Find best match from a list or dictionary given an input + + Args: + query (str): string to test + choices (list): list or dictionary of choices + + Returns: + tuple: (best match, score) + """ + if isinstance(choices, dict): + _choices = list(choices.keys()) + elif isinstance(choices, list): + _choices = choices + else: + raise ValueError('a list or dict of choices must be provided') + + best = (_choices[0], fuzzy_match(query, _choices[0])) + for c in _choices[1:]: + score = fuzzy_match(query, c) + if score > best[1]: + best = (c, score) + + if isinstance(choices, dict): + return (choices[best[0]], best[1]) + else: + return best + + +@localized_function() +def extract_numbers(text, short_scale=True, ordinals=False, lang=''): + """ + Takes in a string and extracts a list of numbers. + + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. 
+ See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + Returns: + list: list of extracted numbers as floats, or empty list if none found + """ + + +@localized_function() +def extract_number(text, short_scale=True, ordinals=False, lang=''): + """Takes in a string and extracts a number. + + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + Returns: + (int, float or False): The number extracted or False if the input + text contains no numbers + """ + + +@localized_function() +def extract_duration(text, lang=''): + """ Convert an english phrase into a number of seconds + + Convert things like: + + * "10 minute" + * "2 and a half hours" + * "3 days 8 hours 10 minutes and 49 seconds" + + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + ``(300, "set a timer for")``. + + Args: + text (str): string containing a duration + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. 
+ """ + + +@localized_function() +def extract_datetime(text, anchorDate=None, lang='', default_time=None): + """ + Extracts date and time information from a sentence. Parses many of the + common ways that humans express dates and times, including relative dates + like "5 days from today", "tomorrow', and "Tuesday". + + Vague terminology are given arbitrary values, like: + - morning = 8 AM + - afternoon = 3 PM + - evening = 7 PM + + If a time isn't supplied or implied, the function defaults to 12 AM + + Args: + text (str): the text to be interpreted + anchorDate (:obj:`datetime`, optional): the date to be used for + relative dating (for example, what does "tomorrow" mean?). + Defaults to the current local date/time. + lang (str): the BCP-47 code for the language to use, None uses default + default_time (datetime.time): time to use if none was found in + the input string. + + Returns: + [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date + as a datetime object in the local timezone. + 'leftover_string' is the original phrase with all date and time + related keywords stripped out. See examples for further + clarification + + Returns 'None' if no date or time related text is found. + + Examples: + + >>> extract_datetime( + ... "What is the weather like the day after tomorrow?", + ... datetime(2017, 6, 30, 00, 00) + ... ) + [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like'] + + >>> extract_datetime( + ... "Set up an appointment 2 weeks from Sunday at 5 pm", + ... datetime(2016, 2, 19, 00, 00) + ... ) + [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment'] + + >>> extract_datetime( + ... "Set up an appointment", + ... datetime(2016, 2, 19, 00, 00) + ... ) + None + """ + + +@localized_function() +def normalize(text, lang='', remove_articles=True): + """Prepare a string for parsing + + This function prepares the given text for parsing by making + numbers consistent, getting rid of contractions, etc. 
+ + Args: + text (str): the string to normalize + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + remove_articles (bool): whether to remove articles (like 'a', or + 'the'). True by default. + + Returns: + (str): The normalized string. + """ + + +@localized_function() +def get_gender(word, context="", lang=''): + """ Guess the gender of a word + + Some languages assign genders to specific words. This method will attempt + to determine the gender, optionally using the provided context sentence. + + Args: + word (str): The word to look up + context (str, optional): String containing word, for context + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + + Returns: + str: The code "m" (male), "f" (female) or "n" (neutral) for the gender, + or None if unknown/or unused in the given language. + """ + + +@localized_function() +def is_fractional(input_str, short_scale=True, lang=''): + """ + This function takes the given text and checks if it is a fraction. + Used by most of the number exractors. + + Will return False on phrases that *contain* a fraction. Only detects + exact matches. To pull a fraction from a string, see extract_number() + + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + """ + + +@localized_function() +def is_ordinal(input_str, lang=''): + """ + This function takes the given text and checks if it is an ordinal number. + + Args: + input_str (str): the string to check if ordinal + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. 
+ Returns: + (bool) or (float): False if not an ordinal, otherwise the number + corresponding to the ordinal + """ diff --git a/lingua_franca/res/text/ca-es/and.word b/lingua_franca/res/text/ca-es/and.word new file mode 100644 index 0000000..0ddf2ba --- /dev/null +++ b/lingua_franca/res/text/ca-es/and.word @@ -0,0 +1 @@ +i diff --git a/lingua_franca/res/text/ca-es/date_time.json b/lingua_franca/res/text/ca-es/date_time.json new file mode 100644 index 0000000..da75a05 --- /dev/null +++ b/lingua_franca/res/text/ca-es/date_time.json @@ -0,0 +1,130 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^2\\d$", "format": "vint-i-{x}"}, + "5": {"match": "^[3-9]\\d$", "format": "{x0}-{x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "{x_in_x00}-cent"}, + "2": {"match": "^\\d{3}$", "format": "{x_in_x00}-cents"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^10\\d\\d$", "format": "mil"}, + "2": {"match": "^11\\d\\d$", "format": "mil cent"}, + "3": {"match": "^1[2-9]\\d\\d$", "format": "mil {x_in_x00}-cents"}, + "4": {"match": "^[2-9]0\\d{2}$", "format": "{x_in_x000} mil"}, + "5": {"match": "^[2-9]1\\d{2}$", "format": "{x_in_x000} mil cent"}, + "6": {"match": "^[2-9][2-9]\\d{2}$", "format": "{x_in_x000} mil {x_in_x00}-cents"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "a.C." 
+ }, + "date_format": { + "date_full": "{weekday}, {day} de {month} de {formatted_year}", + "date_full_no_year": "{weekday}, {day} de {month}", + "date_full_no_year_month": "{weekday}, dia {day}", + "today": "avui", + "tomorrow": "demà", + "yesterday": "ahir" + }, + "date_time_format": { + "date_time": "{formatted_date} a {formatted_time}" + }, + "weekday": { + "0": "dilluns", + "1": "dimarts", + "2": "dimecres", + "3": "dijous", + "4": "divendres", + "5": "dissabte", + "6": "diumenge" + }, + "date": { + "1": "primer", + "2": "dos", + "3": "tres", + "4": "quatre", + "5": "cinc", + "6": "sis", + "7": "set", + "8": "vuit", + "9": "nou", + "10": "deu", + "11": "onze", + "12": "dotze", + "13": "tretze", + "14": "catorze", + "15": "quinze", + "16": "setze", + "17": "disset", + "18": "divuit", + "19": "dinou", + "20": "vint", + "21": "vint-i-u", + "22": "vint-i-dos", + "23": "vint-i-tres", + "24": "vint-i-quatre", + "25": "vint-i-cinc", + "26": "vint-i-sis", + "27": "vint-i-set", + "28": "vint-i-vuit", + "29": "vint-i-nou", + "30": "trenta", + "31": "trenta-u" + }, + "month": { + "1": "gener", + "2": "febrer", + "3": "març", + "4": "abril", + "5": "maig", + "6": "juny", + "7": "juliol", + "8": "agost", + "9": "setembre", + "10": "octubre", + "11": "novembre", + "12": "desembre" + }, + "number": { + "0": "zero", + "1": "u", + "2": "dos", + "3": "tres", + "4": "quatre", + "5": "cinc", + "6": "sis", + "7": "set", + "8": "vuit", + "9": "nou", + "10": "deu", + "11": "onze", + "12": "dotze", + "13": "tretze", + "14": "catorze", + "15": "quinze", + "16": "setze", + "17": "disset", + "18": "divuit", + "19": "dinou", + "20": "vint", + "30": "trenta", + "40": "quaranta", + "50": "cinquanta", + "60": "seixanta", + "70": "setanta", + "80": "vuitanta", + "90": "noranta" + } +} diff --git a/lingua_franca/res/text/ca-es/date_time_test.json b/lingua_franca/res/text/ca-es/date_time_test.json new file mode 100644 index 0000000..ea1087d --- /dev/null +++ 
b/lingua_franca/res/text/ca-es/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "u a.C." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "deu a.C." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "noranta-dos a.C." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vuit-cents tres" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vuit-cents onze" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "quatre-cents cinquanta-quatre" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil cinc" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil dotze" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil quaranta-sis" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil vuit-cents set" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil set-cents disset" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil nou-cents vuitanta-vuit"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil nou"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil divuit"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil vint-i-u"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil trenta"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dos mil cent" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": 
"dos mil" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tres mil cent vint a.C." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tres mil dos-cents quaranta-u a.C." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinc mil dos-cents" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil cent" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dos mil cent" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "dimarts, trenta-u de gener de dos mil disset"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer de dos mil divuit"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "diumenge, dia quatre"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "demà"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "avui"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "ahir"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer de dos mil divuit"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "dimarts, trenta-u de gener de dos mil disset a la una i vint-i-dos de la tarda"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", 
"use_ampm": "False", "assertEqual": "dimarts, trenta-u de gener de dos mil disset a les tretze i vint-i-dos"} + } +} diff --git a/lingua_franca/res/text/ca-es/day.word b/lingua_franca/res/text/ca-es/day.word new file mode 100644 index 0000000..4e3b8d2 --- /dev/null +++ b/lingua_franca/res/text/ca-es/day.word @@ -0,0 +1 @@ +dia diff --git a/lingua_franca/res/text/ca-es/days.word b/lingua_franca/res/text/ca-es/days.word new file mode 100644 index 0000000..42e1e27 --- /dev/null +++ b/lingua_franca/res/text/ca-es/days.word @@ -0,0 +1 @@ +dies diff --git a/lingua_franca/res/text/ca-es/hour.word b/lingua_franca/res/text/ca-es/hour.word new file mode 100644 index 0000000..cb26009 --- /dev/null +++ b/lingua_franca/res/text/ca-es/hour.word @@ -0,0 +1 @@ +hora diff --git a/lingua_franca/res/text/ca-es/hours.word b/lingua_franca/res/text/ca-es/hours.word new file mode 100644 index 0000000..badb15f --- /dev/null +++ b/lingua_franca/res/text/ca-es/hours.word @@ -0,0 +1 @@ +hores diff --git a/lingua_franca/res/text/ca-es/minute.word b/lingua_franca/res/text/ca-es/minute.word new file mode 100644 index 0000000..7a715f8 --- /dev/null +++ b/lingua_franca/res/text/ca-es/minute.word @@ -0,0 +1 @@ +minut diff --git a/lingua_franca/res/text/ca-es/minutes.word b/lingua_franca/res/text/ca-es/minutes.word new file mode 100644 index 0000000..3a1f23b --- /dev/null +++ b/lingua_franca/res/text/ca-es/minutes.word @@ -0,0 +1 @@ +minuts diff --git a/lingua_franca/res/text/ca-es/normalize.json b/lingua_franca/res/text/ca-es/normalize.json new file mode 100644 index 0000000..76fbdc2 --- /dev/null +++ b/lingua_franca/res/text/ca-es/normalize.json @@ -0,0 +1,109 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": false, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": true, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "catorze": "14", + "cent": "100", + "cents": "100", + "cinc": "5", 
+ "cinc-centes": "500", + "cinc-cents": "500", + "cinquanta": "50", + "deu": "10", + "dinou": "19", + "setze": "16", + "disset": "17", + "dihuit": "18", + "divuit": "18", + "dos": "2", + "dos-centes": "200", + "dos-cents": "200", + "dotze": "12", + "dues": "2", + "dues-centes": "200", + "huitanta": "80", + "huit": "8", + "huit-centes": "800", + "huit-cents": "800", + "mil": "1000", + "milió": "1000000", + "nou": "9", + "nou-centes": "900", + "nou-cents": "900", + "noranta": "90", + "onze": "11", + "primer": "1", + "primera": "1", + "quaranta": "40", + "quatre": "4", + "quatre-centes": "400", + "quatre-cents": "400", + "quinze": "15", + "segon": "2", + "segona": "2", + "seixanta": "60", + "set": "7", + "set-centes": "700", + "set-cents": "700", + "setanta": "70", + "sis": "6", + "sis-centes": "600", + "sis-cents": "600", + "tercer": "3", + "trenta": "30", + "tres": "3", + "tres-centes": "300", + "tres-cents": "300", + "tretze": "13", + "u": "1", + "un": "1", + "una": "1", + "vint": "20", + "vuitanta": "80", + "vuit": "8", + "vuit-centes": "800", + "vuit-cents": "800", + "zero": "0" + }, + "stopwords": [ + "de", + "del", + "dels", + "ell", + "ella", + "ells", + "elles", + "jo", + "i", + "al", + "dins la", + "a la", + "nosaltres", + "dins el", + "para", + "aquest", + "aquesta", + "aquests", + "aquestes", + "aquell", + "aquella", + "aquells", + "aquelles", + "que" + ], + "articles": [ + "el", + "la", + "l", + "els", + "les", + "los" + ] +} diff --git a/lingua_franca/res/text/ca-es/or.word b/lingua_franca/res/text/ca-es/or.word new file mode 100644 index 0000000..13e7564 --- /dev/null +++ b/lingua_franca/res/text/ca-es/or.word @@ -0,0 +1 @@ +o diff --git a/lingua_franca/res/text/ca-es/second.word b/lingua_franca/res/text/ca-es/second.word new file mode 100644 index 0000000..82c0a03 --- /dev/null +++ b/lingua_franca/res/text/ca-es/second.word @@ -0,0 +1 @@ +segon diff --git a/lingua_franca/res/text/ca-es/seconds.word b/lingua_franca/res/text/ca-es/seconds.word new file 
mode 100644 index 0000000..0c38c75 --- /dev/null +++ b/lingua_franca/res/text/ca-es/seconds.word @@ -0,0 +1 @@ +segons diff --git a/lingua_franca/res/text/cs-cz/and.word b/lingua_franca/res/text/cs-cz/and.word new file mode 100644 index 0000000..2e65efe --- /dev/null +++ b/lingua_franca/res/text/cs-cz/and.word @@ -0,0 +1 @@ +a \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/date_time.json b/lingua_franca/res/text/cs-cz/date_time.json new file mode 100644 index 0000000..e4430fd --- /dev/null +++ b/lingua_franca/res/text/cs-cz/date_time.json @@ -0,0 +1,129 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{3}$", "format": "{x_in_x00} sto"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} tisíc"}, + "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} sto"}, + "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} sto"}, + "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, + "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "b.c." 
+ }, + "date_format": { + "date_full": "{weekday}, {month} {day}, {formatted_year}", + "date_full_no_year": "{weekday}, {month} {day}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "dnes", + "tomorrow": "zítra", + "yesterday": "včera" + }, + "date_time_format": { + "date_time": "{formatted_date} v {formatted_time}" + }, + "weekday": { + "0": "pondělí", + "1": "úterý", + "2": "středa", + "3": "čtvrtek", + "4": "pátek", + "5": "sobota", + "6": "neděle" + }, + "date": { + "1": "prvního", + "2": "druhého", + "3": "třetího", + "4": "čtvrtého", + "5": "pátého", + "6": "šestého", + "7": "sedmého", + "8": "osmého", + "9": "devátého", + "10": "desátého", + "11": "jedenáctého", + "12": "dvanáctého", + "13": "třináctého", + "14": "čtrnáctého", + "15": "patnáctého", + "16": "šestnáctého", + "17": "sedmnáctého", + "18": "osmnáctého", + "19": "devatenáctého", + "20": "dvacátého", + "21": "dvacátého-prvního", + "22": "dvacátého-druhého", + "23": "dvacátého-třetího", + "24": "dvacátého-čtvrtého", + "25": "dvacátého-pátého", + "26": "dvacátého-šestého", + "27": "dvacátého-sedmého", + "28": "dvacátého-osmého", + "29": "dvacátého-devátého", + "30": "třicátého", + "31": "třicátého-prvního" + }, + "month": { + "1": "leden", + "2": "únor", + "3": "březen", + "4": "duben", + "5": "květen", + "6": "červen", + "7": "červenec", + "8": "srpen", + "9": "září", + "10": "říjen", + "11": "listopad", + "12": "prosinec" + }, + "number": { + "0": "nula", + "1": "jedna", + "2": "dva", + "3": "tři", + "4": "čtyři", + "5": "pět", + "6": "šest", + "7": "sedm", + "8": "osm", + "9": "devět", + "10": "deset", + "11": "jedenáct", + "12": "dvanáct", + "13": "třináct", + "14": "čtrnáct", + "15": "patnáct", + "16": "šestnáct", + "17": "sedmnáct", + "18": "osmnáct", + "19": "devatenáct", + "20": "dvacet", + "30": "třicet", + "40": "čtyřicet", + "50": "padesát", + "60": "šedesát", + "70": "sedmdesát", + "80": "osmdesát", + "90": "devadesát" + } +} diff --git 
a/lingua_franca/res/text/cs-cz/date_time_test.json b/lingua_franca/res/text/cs-cz/date_time_test.json new file mode 100644 index 0000000..97459e6 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "jedna b.c." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "deset b.c." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "devadesát dva b.c." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osm sto tři" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osm sto jedenáct" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "čtyři sto padesát čtyři" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "jedna tisíc pět" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deset dvanáct" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deset čtyřicet šest" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osmnáct sedm" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "sedmnáct sedmnáct" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "devatenáct osmdesát osm"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisíc devět"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dvacet osmnáct"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dvacet dvacet jedna"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dvacet třicet"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dvacet jedna sto" }, + 
"18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "jedna tisíc" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisíc" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "třicet jedna dvacet b.c." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "třicet dva čtyřicet jedna b.c." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "padesát dva sto" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "jedenáct sto" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dvacet jedna sto" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "úterý, leden třicátého-prvního, dvacet sedmnáct"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého, dvacet osmnáct"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "neděle, čtvrtého"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "zítra"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "dnes"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "včera"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého, dvacet osmnáct"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "úterý, leden 
třicátého-prvního, dvacet sedmnáct v jedna dvacet dva p.m."}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "úterý, leden třicátého-prvního, dvacet sedmnáct v třináct dvacet dva"} + } +} diff --git a/lingua_franca/res/text/cs-cz/day.word b/lingua_franca/res/text/cs-cz/day.word new file mode 100644 index 0000000..aaf7116 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/day.word @@ -0,0 +1 @@ +den \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/days.word b/lingua_franca/res/text/cs-cz/days.word new file mode 100644 index 0000000..ed43d6f --- /dev/null +++ b/lingua_franca/res/text/cs-cz/days.word @@ -0,0 +1 @@ +dní \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/hour.word b/lingua_franca/res/text/cs-cz/hour.word new file mode 100644 index 0000000..e8a0565 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/hour.word @@ -0,0 +1 @@ +hodina \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/hours.word b/lingua_franca/res/text/cs-cz/hours.word new file mode 100644 index 0000000..c62f391 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/hours.word @@ -0,0 +1 @@ +hodiny \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/minute.word b/lingua_franca/res/text/cs-cz/minute.word new file mode 100644 index 0000000..02810df --- /dev/null +++ b/lingua_franca/res/text/cs-cz/minute.word @@ -0,0 +1 @@ +minuta \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/minutes.word b/lingua_franca/res/text/cs-cz/minutes.word new file mode 100644 index 0000000..5181d68 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/minutes.word @@ -0,0 +1 @@ +minuty \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/normalize.json b/lingua_franca/res/text/cs-cz/normalize.json new file mode 100644 index 0000000..9493b78 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/normalize.json @@ -0,0 +1,46 @@ +{ + "lowercase": 
false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "nula": "0", + "jedna": "1", + "dva": "2", + "dvě": "2", + "tři": "3", + "čtyři": "4", + "pět": "5", + "šest": "6", + "sedm": "7", + "sedum": "7", + "osm": "8", + "osum": "8", + "devět": "9", + "deset": "10", + "jedenáct": "11", + "dvanáct": "12", + "třináct": "13", + "čtrnáct": "14", + "patnáct": "15", + "šestnáct": "16", + "sedmnáct": "17", + "osmnáct": "18", + "devatenáct": "19", + "dvacet": "20", + "třicet": "30", + "čtyřicet": "40", + "padesát": "50", + "šedesát": "60", + "sedmdesát": "70", + "osmdesát": "80", + "devadesát": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/or.word b/lingua_franca/res/text/cs-cz/or.word new file mode 100644 index 0000000..38c14ca --- /dev/null +++ b/lingua_franca/res/text/cs-cz/or.word @@ -0,0 +1 @@ +nebo \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/second.word b/lingua_franca/res/text/cs-cz/second.word new file mode 100644 index 0000000..ef210e2 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/second.word @@ -0,0 +1 @@ +sekunda \ No newline at end of file diff --git a/lingua_franca/res/text/cs-cz/seconds.word b/lingua_franca/res/text/cs-cz/seconds.word new file mode 100644 index 0000000..91bf586 --- /dev/null +++ b/lingua_franca/res/text/cs-cz/seconds.word @@ -0,0 +1 @@ +sekundy \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/and.word b/lingua_franca/res/text/da-dk/and.word new file mode 100644 index 0000000..ae9a4e6 --- /dev/null +++ b/lingua_franca/res/text/da-dk/and.word @@ -0,0 +1 @@ +og \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/date_time.json b/lingua_franca/res/text/da-dk/date_time.json new file mode 100644 index 
0000000..390f620 --- /dev/null +++ b/lingua_franca/res/text/da-dk/date_time.json @@ -0,0 +1,132 @@ +{ + "decade_format": { + "1": {"match": "^1$", "format": "et"}, + "2": {"match": "^\\d$", "format": "{x}"}, + "3": {"match": "^1\\d$", "format": "{xx}"}, + "4": {"match": "^\\d0$", "format": "{x0}"}, + "5": {"match": "^[2-9]\\d$", "format": "{x} og {x0}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "et hundred"}, + "2": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^1[1-9]\\d{2}$", "format": "{xx_in_xx00} hundred"}, + "2": {"match": "^1\\d{3}$", "format": "et tusind"}, + "3": {"match": "^\\d{4}$", "format": "{x_in_x000} tusind"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} og {formatted_decade} {bc}"}, + "4": {"match": "^(1\\d00)|([2-9]000)$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{formatted_thousand} og {formatted_decade} {bc}"}, + "6": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_hundreds} og {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "f.kr." 
+ }, + "date_format": { + "date_full": "{weekday}, den {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, den {day} {month}", + "date_full_no_year_month": "{weekday}, den {day}", + "today": "i dag", + "tomorrow": "i morgen", + "yesterday": "i går" + }, + "date_time_format": { + "date_time": "{formatted_date} klokken {formatted_time}" + }, + "weekday": { + "0": "mandag", + "1": "tirsdag", + "2": "onsdag", + "3": "torsdag", + "4": "fredag", + "5": "lørdag", + "6": "søndag" + }, + "date": { + "1": "første", + "2": "anden", + "3": "tredie", + "4": "fjerde", + "5": "femte", + "6": "sjette", + "7": "syvende", + "8": "ottende", + "9": "ninende", + "10": "tiende", + "11": "elvte", + "12": "tolvte", + "13": "trettende", + "14": "fjortende", + "15": "femtende", + "16": "sekstende", + "17": "syttende", + "18": "attende", + "19": "nittende", + "20": "tyvende", + "21": "en og tyvende", + "22": "to og tyvende", + "23": "tre og tyvende", + "24": "fire og tyvende", + "25": "fem og tyvende", + "26": "seks og tyvende", + "27": "syv og tyvende", + "28": "otte og tyvende", + "29": "ni og tyvende", + "30": "tredivte", + "31": "en og tredivte" + }, + "month": { + "1": "januar", + "2": "februar", + "3": "marts", + "4": "april", + "5": "maj", + "6": "juni", + "7": "juli", + "8": "august", + "9": "september", + "10": "oktober", + "11": "november", + "12": "december" + }, + "number": { + "0": "nul", + "1": "en", + "2": "to", + "3": "tre", + "4": "fire", + "5": "fem", + "6": "seks", + "7": "syv", + "8": "otte", + "9": "ni", + "10": "ti", + "11": "elve", + "12": "tolv", + "13": "tretten", + "14": "fjorten", + "15": "femten", + "16": "seksten", + "17": "sytten", + "18": "atten", + "19": "nitten", + "20": "tyve", + "30": "tredive", + "40": "fyrre", + "50": "halvtreds", + "60": "treds", + "70": "halvfjerds", + "80": "firs", + "90": "halvfems", + "100": "hundrede", + "1000": "tusind", + "2000": "to tusind" + } + +} diff --git a/lingua_franca/res/text/da-dk/date_time_test.json 
b/lingua_franca/res/text/da-dk/date_time_test.json new file mode 100644 index 0000000..7705399 --- /dev/null +++ b/lingua_franca/res/text/da-dk/date_time_test.json @@ -0,0 +1,32 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "to tusind og sytten"}, + "2": {"datetime_param": "1984, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nitten hundred og fire og firs"}, + "3": {"datetime_param": "1906, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nitten hundred og seks"}, + "4": {"datetime_param": "1802, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "atten hundred og to" }, + "5": {"datetime_param": "806, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "otte hundred og seks" }, + "6": {"datetime_param": "1800, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "atten hundred" }, + "7": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "et" }, + "8": {"datetime_param": "103, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "et hundred og tre" }, + "9": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "et tusind" }, + "10": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "to tusind" }, + "11": {"datetime_param": "99, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ni og halvfems f.kr." }, + "12": {"datetime_param": "5, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "fem f.kr." }, + "13": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tre tusind et hundred og tyve f.kr." }, + "14": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tre tusind to hundred og en og fyrre f.kr." 
} + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tirsdag, den en og tredivte januar, to tusind og sytten"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "søndag, den fjerde februar, to tusind og atten"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "søndag, den fjerde februar"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "søndag, den fjerde"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "i morgen"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "i dag"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "i går"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "søndag, den fjerde februar"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "søndag, den fjerde februar, to tusind og atten"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tirsdag, den en og tredivte januar, to tusind og sytten klokken et toogtyve om eftermiddagen"} + } +} \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/day.word b/lingua_franca/res/text/da-dk/day.word new file mode 100644 index 0000000..73e686a --- /dev/null +++ b/lingua_franca/res/text/da-dk/day.word @@ -0,0 +1 @@ +dag \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/days.word b/lingua_franca/res/text/da-dk/days.word new file mode 100644 index 0000000..1b5c2d6 --- /dev/null +++ b/lingua_franca/res/text/da-dk/days.word @@ -0,0 +1 @@ +dage \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/hour.word 
b/lingua_franca/res/text/da-dk/hour.word new file mode 100644 index 0000000..0082886 --- /dev/null +++ b/lingua_franca/res/text/da-dk/hour.word @@ -0,0 +1 @@ +time \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/hours.word b/lingua_franca/res/text/da-dk/hours.word new file mode 100644 index 0000000..036d1ab --- /dev/null +++ b/lingua_franca/res/text/da-dk/hours.word @@ -0,0 +1 @@ +timer \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/minute.word b/lingua_franca/res/text/da-dk/minute.word new file mode 100644 index 0000000..4b98366 --- /dev/null +++ b/lingua_franca/res/text/da-dk/minute.word @@ -0,0 +1 @@ +minut \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/minutes.word b/lingua_franca/res/text/da-dk/minutes.word new file mode 100644 index 0000000..caf1b02 --- /dev/null +++ b/lingua_franca/res/text/da-dk/minutes.word @@ -0,0 +1 @@ +minuter \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/or.word b/lingua_franca/res/text/da-dk/or.word new file mode 100644 index 0000000..d99648a --- /dev/null +++ b/lingua_franca/res/text/da-dk/or.word @@ -0,0 +1 @@ +eller \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/second.word b/lingua_franca/res/text/da-dk/second.word new file mode 100644 index 0000000..300f8e5 --- /dev/null +++ b/lingua_franca/res/text/da-dk/second.word @@ -0,0 +1 @@ +sekund \ No newline at end of file diff --git a/lingua_franca/res/text/da-dk/seconds.word b/lingua_franca/res/text/da-dk/seconds.word new file mode 100644 index 0000000..aa5fc12 --- /dev/null +++ b/lingua_franca/res/text/da-dk/seconds.word @@ -0,0 +1 @@ +sekunder \ No newline at end of file diff --git a/lingua_franca/res/text/de-de/and.word b/lingua_franca/res/text/de-de/and.word new file mode 100644 index 0000000..ee61ac9 --- /dev/null +++ b/lingua_franca/res/text/de-de/and.word @@ -0,0 +1 @@ +und diff --git a/lingua_franca/res/text/de-de/date_time.json 
b/lingua_franca/res/text/de-de/date_time.json new file mode 100644 index 0000000..8a5aace --- /dev/null +++ b/lingua_franca/res/text/de-de/date_time.json @@ -0,0 +1,136 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x} und {x0}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "hundert"}, + "2": {"match": "^\\d{3}$", "format": "{x_in_x00} hundert"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^10\\d\\d$", "format": "tausend"}, + "2": {"match": "^\\d0\\d{2}$", "format": "{x_in_x000} tausend"}, + "3": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundert"}, + "4": {"match": "^\\d{2}00$", "format": "{x_in_x000} tausend {x_in_x00} hundert"}, + "5": {"match": "^\\d0\\d\\d$", "format": "{x_in_x000} tausend"}, + "6": {"match": "^1\\d{3}$", "format": "{xx_in_xx00}"}, + "7": {"match": "^\\d{4}$", "format": "{x_in_x000} tausend {x_in_x00} hundert"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^1$", "format": "eins {bc}"}, + "2": {"match": "^\\d{1}?$", "format": "{formatted_decade} {bc}"}, + "3": {"match": "^\\d{2}?$", "format": "{formatted_decade} {bc}"}, + "4": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "5": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "7": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "8": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} hundert {formatted_decade} {bc}"}, + "9": {"match": "^1[2-9]\\d{2}$", "format": "{formatted_thousand} hundert {formatted_decade} {bc}"}, + "10": {"match": "^1\\d{3}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "11": {"match": "^\\d{4}$", "format": "{formatted_thousand} 
{formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "v.d.Z." + }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "heute", + "tomorrow": "morgen", + "yesterday": "gestern" + }, + "date_time_format": { + "date_time": "{formatted_date} um {formatted_time}" + }, + "weekday": { + "0": "Montag", + "1": "Dienstag", + "2": "Mittwoch", + "3": "Donnerstag", + "4": "Freitag", + "5": "Samstag", + "6": "Sonntag" + }, + "date": { + "1": "erster", + "2": "zweiter", + "3": "dritter", + "4": "vierter", + "5": "fünfter", + "6": "sechster", + "7": "siebter", + "8": "achter", + "9": "neunter", + "10": "zehnter", + "11": "elfter", + "12": "zwölfter", + "13": "dreizehnter", + "14": "vierzehnter", + "15": "fünfzehnter", + "16": "sechzehnter", + "17": "siebzehnter", + "18": "achtzehnter", + "19": "neunzehnter", + "20": "zwanzigster", + "21": "einundzwanzigster", + "22": "zweiundzwanzigster", + "23": "dreiundzwanzigster", + "24": "vierundzwanzigster", + "25": "fünfundzwanzigster", + "26": "sechsundzwanzigster", + "27": "siebenundzwanzigster", + "28": "achtundzwanzigster", + "29": "neunundzwanzigster", + "30": "dreißigster", + "31": "einunddreißigster" + }, + "month": { + "1": "Januar", + "2": "Februar", + "3": "März", + "4": "April", + "5": "Mai", + "6": "Juni", + "7": "Juli", + "8": "August", + "9": "September", + "10": "Oktober", + "11": "November", + "12": "Dezember" + }, + "number": { + "0": "null", + "1": "ein", + "2": "zwei", + "3": "drei", + "4": "vier", + "5": "fünf", + "6": "sechs", + "7": "sieben", + "8": "acht", + "9": "neun", + "10": "zehn", + "11": "elf", + "12": "zwölf", + "13": "dreizehn", + "14": "vierzehn", + "15": "fünfzehn", + "16": "sechzehn", + "17": "siebzehn", + "18": "achtzehn", + "19": "neunzehn", + "20": "zwanzig", + "30": "dreißig", + "40": "vierzig", + "50": "fünfzig", + "60": "sechzig", + "70": "siebzig", + 
"80": "achtzig", + "90": "neunzig" + } +} diff --git a/lingua_franca/res/text/de-de/date_time_test.json b/lingua_franca/res/text/de-de/date_time_test.json new file mode 100644 index 0000000..1bea06f --- /dev/null +++ b/lingua_franca/res/text/de-de/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "eins v.d.Z." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "zehn v.d.Z." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "zwei und neunzig v.d.Z." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht hundert drei" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht hundert elf" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vier hundert vier und fünfzig" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend fünf" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend zwölf" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend sechs und vierzig" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "achtzehn hundert sieben" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "siebzehn hundert siebzehn" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "neunzehn hundert acht und achtzig"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend neun"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend achtzehn"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend ein und zwanzig"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": 
"zwei tausend dreißig"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "zwei tausend ein hundert" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tausend" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "drei tausend ein hundert zwanzig v.d.Z." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "drei tausend zwei hundert ein und vierzig v.d.Z." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fünf tausend zwei hundert" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elf hundert" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "zwei tausend ein hundert" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar, zwei tausend achtzehn"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "morgen"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "heute"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "gestern"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", 
"assertEqual": "Sonntag, vierter Februar, zwei tausend achtzehn"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn um ein Uhr zweiundzwanzig nachmittags"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn um dreizehn Uhr zweiundzwanzig"} + } +} diff --git a/lingua_franca/res/text/de-de/day.word b/lingua_franca/res/text/de-de/day.word new file mode 100644 index 0000000..1aa7c25 --- /dev/null +++ b/lingua_franca/res/text/de-de/day.word @@ -0,0 +1 @@ +Tag diff --git a/lingua_franca/res/text/de-de/days.word b/lingua_franca/res/text/de-de/days.word new file mode 100644 index 0000000..4c2fd37 --- /dev/null +++ b/lingua_franca/res/text/de-de/days.word @@ -0,0 +1 @@ +Tage diff --git a/lingua_franca/res/text/de-de/hour.word b/lingua_franca/res/text/de-de/hour.word new file mode 100644 index 0000000..7e69c57 --- /dev/null +++ b/lingua_franca/res/text/de-de/hour.word @@ -0,0 +1 @@ +Stunde diff --git a/lingua_franca/res/text/de-de/hours.word b/lingua_franca/res/text/de-de/hours.word new file mode 100644 index 0000000..3c728ba --- /dev/null +++ b/lingua_franca/res/text/de-de/hours.word @@ -0,0 +1 @@ +Stunden diff --git a/lingua_franca/res/text/de-de/minute.word b/lingua_franca/res/text/de-de/minute.word new file mode 100644 index 0000000..de47666 --- /dev/null +++ b/lingua_franca/res/text/de-de/minute.word @@ -0,0 +1 @@ +Minute diff --git a/lingua_franca/res/text/de-de/minutes.word b/lingua_franca/res/text/de-de/minutes.word new file mode 100644 index 0000000..bdc262e --- /dev/null +++ b/lingua_franca/res/text/de-de/minutes.word @@ -0,0 +1 @@ +Minuten diff --git a/lingua_franca/res/text/de-de/or.word b/lingua_franca/res/text/de-de/or.word new file mode 100644 index 0000000..2fbe532 
--- /dev/null +++ b/lingua_franca/res/text/de-de/or.word @@ -0,0 +1 @@ +oder diff --git a/lingua_franca/res/text/de-de/second.word b/lingua_franca/res/text/de-de/second.word new file mode 100644 index 0000000..e658c21 --- /dev/null +++ b/lingua_franca/res/text/de-de/second.word @@ -0,0 +1 @@ +Sekunde diff --git a/lingua_franca/res/text/de-de/seconds.word b/lingua_franca/res/text/de-de/seconds.word new file mode 100644 index 0000000..2c54f29 --- /dev/null +++ b/lingua_franca/res/text/de-de/seconds.word @@ -0,0 +1 @@ +Sekunden diff --git a/lingua_franca/res/text/en-au/date_time.json b/lingua_franca/res/text/en-au/date_time.json new file mode 100644 index 0000000..951688c --- /dev/null +++ b/lingua_franca/res/text/en-au/date_time.json @@ -0,0 +1,129 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} thousand"}, + "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundred"}, + "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred"}, + "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, + "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} oh 
{formatted_decade} {bc}"}, + "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "b.c." + }, + "date_format": { + "date_full": "{weekday}, {month} {day}, {formatted_year}", + "date_full_no_year": "{weekday}, {month} {day}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "today", + "tomorrow": "tomorrow", + "yesterday": "yesterday" + }, + "date_time_format": { + "date_time": "{formatted_date} at {formatted_time}" + }, + "weekday": { + "0": "monday", + "1": "tuesday", + "2": "wednesday", + "3": "thursday", + "4": "friday", + "5": "saturday", + "6": "sunday" + }, + "date": { + "1": "first", + "2": "second", + "3": "third", + "4": "fourth", + "5": "fifth", + "6": "sixth", + "7": "seventh", + "8": "eighth", + "9": "ninth", + "10": "tenth", + "11": "eleventh", + "12": "twelfth", + "13": "thirteenth", + "14": "fourteenth", + "15": "fifteenth", + "16": "sixteenth", + "17": "seventeenth", + "18": "eighteenth", + "19": "nineteenth", + "20": "twentieth", + "21": "twenty-first", + "22": "twenty-second", + "23": "twenty-third", + "24": "twenty-fourth", + "25": "twenty-fifth", + "26": "twenty-sixth", + "27": "twenty-seventh", + "28": "twenty-eighth", + "29": "twenty-ninth", + "30": "thirtieth", + "31": "thirty-first" + }, + "month": { + "1": "january", + "2": "february", + "3": "march", + "4": "april", + "5": "may", + "6": "june", + "7": "july", + "8": "august", + "9": "september", + "10": "october", + "11": "november", + "12": "december" + }, + "number": { + "0": "zero", + "1": "one", + "2": "two", + "3": "three", + "4": "four", + "5": "five", + "6": "six", + "7": "seven", + "8": "eight", + "9": "nine", + "10": "ten", + "11": "eleven", + "12": "twelve", + "13": "thirteen", + "14": "fourteen", + "15": "fifteen", + "16": "sixteen", + "17": "seventeen", + "18": "eighteen", + "19": "nineteen", + "20": "twenty", + "30": "thirty", + "40": "forty", + "50": "fifty", + "60": "sixty", + "70": "seventy", + 
"80": "eighty", + "90": "ninety" + } +} \ No newline at end of file diff --git a/lingua_franca/res/text/en-au/date_time_test.json b/lingua_franca/res/text/en-au/date_time_test.json new file mode 100644 index 0000000..af83e02 --- /dev/null +++ b/lingua_franca/res/text/en-au/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "one b.c." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ten b.c." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ninety two b.c." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred three" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred eleven" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "four hundred fifty four" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "one thousand five" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten twelve" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten forty six" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eighteen oh seven" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "seventeen seventeen" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nineteen eighty eight"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand nine"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty eighteen"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty twenty one"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty thirty"}, + "17": 
{"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "one thousand" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty one twenty b.c." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty two forty one b.c." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fifty two hundred" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "eleven hundred" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tuesday, january thirty-first, twenty seventeen"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "sunday, fourth"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "tomorrow"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "today"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "yesterday"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"} + }, + "test_nice_date_time": { + "1": 
{"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tuesday, january thirty-first, twenty seventeen at one twenty two p.m."}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "tuesday, january thirty-first, twenty seventeen at thirteen twenty two"} + } +} diff --git a/lingua_franca/res/text/en-us/and.word b/lingua_franca/res/text/en-us/and.word new file mode 100644 index 0000000..c51107c --- /dev/null +++ b/lingua_franca/res/text/en-us/and.word @@ -0,0 +1 @@ +and \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/date_time.json b/lingua_franca/res/text/en-us/date_time.json new file mode 100644 index 0000000..c9ca605 --- /dev/null +++ b/lingua_franca/res/text/en-us/date_time.json @@ -0,0 +1,129 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} thousand"}, + "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundred"}, + "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred"}, + "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, + "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} 
{formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} oh {formatted_decade} {bc}"}, + "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "b.c." + }, + "date_format": { + "date_full": "{weekday}, {month} {day}, {formatted_year}", + "date_full_no_year": "{weekday}, {month} {day}", + "date_full_no_year_month": "{weekday}, the {day}", + "today": "today", + "tomorrow": "tomorrow", + "yesterday": "yesterday" + }, + "date_time_format": { + "date_time": "{formatted_date} at {formatted_time}" + }, + "weekday": { + "0": "monday", + "1": "tuesday", + "2": "wednesday", + "3": "thursday", + "4": "friday", + "5": "saturday", + "6": "sunday" + }, + "date": { + "1": "first", + "2": "second", + "3": "third", + "4": "fourth", + "5": "fifth", + "6": "sixth", + "7": "seventh", + "8": "eighth", + "9": "ninth", + "10": "tenth", + "11": "eleventh", + "12": "twelfth", + "13": "thirteenth", + "14": "fourteenth", + "15": "fifteenth", + "16": "sixteenth", + "17": "seventeenth", + "18": "eighteenth", + "19": "nineteenth", + "20": "twentieth", + "21": "twenty-first", + "22": "twenty-second", + "23": "twenty-third", + "24": "twenty-fourth", + "25": "twenty-fifth", + "26": "twenty-sixth", + "27": "twenty-seventh", + "28": "twenty-eighth", + "29": "twenty-ninth", + "30": "thirtieth", + "31": "thirty-first" + }, + "month": { + "1": "january", + "2": "february", + "3": "march", + "4": "april", + "5": "may", + "6": "june", + "7": "july", + "8": "august", + "9": "september", + "10": "october", + "11": "november", + "12": "december" + }, + "number": { + "0": "zero", + "1": "one", + "2": "two", + "3": "three", + "4": "four", + "5": "five", + "6": "six", + "7": "seven", + "8": "eight", + "9": "nine", + "10": "ten", + "11": "eleven", + "12": "twelve", + "13": "thirteen", + "14": "fourteen", + "15": "fifteen", + "16": "sixteen", + "17": "seventeen", + "18": "eighteen", + "19": "nineteen", + "20": 
"twenty", + "30": "thirty", + "40": "forty", + "50": "fifty", + "60": "sixty", + "70": "seventy", + "80": "eighty", + "90": "ninety" + } +} diff --git a/lingua_franca/res/text/en-us/date_time_test.json b/lingua_franca/res/text/en-us/date_time_test.json new file mode 100644 index 0000000..ffde77b --- /dev/null +++ b/lingua_franca/res/text/en-us/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "one b.c." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ten b.c." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ninety two b.c." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred three" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred eleven" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "four hundred fifty four" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "one thousand five" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten twelve" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten forty six" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eighteen oh seven" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "seventeen seventeen" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nineteen eighty eight"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand nine"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty eighteen"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty twenty one"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", 
"bc": "None", "assertEqual": "twenty thirty"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "one thousand" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty one twenty b.c." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty two forty one b.c." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fifty two hundred" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "eleven hundred" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tuesday, january thirty-first, twenty seventeen"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "sunday, the fourth"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "tomorrow"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "today"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "yesterday"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty 
eighteen"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tuesday, january thirty-first, twenty seventeen at one twenty two p.m."}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "tuesday, january thirty-first, twenty seventeen at thirteen twenty two"} + } +} diff --git a/lingua_franca/res/text/en-us/day.word b/lingua_franca/res/text/en-us/day.word new file mode 100644 index 0000000..0c303a4 --- /dev/null +++ b/lingua_franca/res/text/en-us/day.word @@ -0,0 +1 @@ +day \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/days.word b/lingua_franca/res/text/en-us/days.word new file mode 100644 index 0000000..5eb8de3 --- /dev/null +++ b/lingua_franca/res/text/en-us/days.word @@ -0,0 +1 @@ +days \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/hour.word b/lingua_franca/res/text/en-us/hour.word new file mode 100644 index 0000000..a13960e --- /dev/null +++ b/lingua_franca/res/text/en-us/hour.word @@ -0,0 +1 @@ +hour \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/hours.word b/lingua_franca/res/text/en-us/hours.word new file mode 100644 index 0000000..62c6dec --- /dev/null +++ b/lingua_franca/res/text/en-us/hours.word @@ -0,0 +1 @@ +hours \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/minute.word b/lingua_franca/res/text/en-us/minute.word new file mode 100644 index 0000000..50bc2f2 --- /dev/null +++ b/lingua_franca/res/text/en-us/minute.word @@ -0,0 +1 @@ +minute \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/minutes.word b/lingua_franca/res/text/en-us/minutes.word new file mode 100644 index 0000000..cde6523 --- /dev/null +++ b/lingua_franca/res/text/en-us/minutes.word @@ -0,0 +1 @@ +minutes \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/normalize.json 
b/lingua_franca/res/text/en-us/normalize.json new file mode 100644 index 0000000..4126c02 --- /dev/null +++ b/lingua_franca/res/text/en-us/normalize.json @@ -0,0 +1,141 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": { + "I'd": "I would", + "I'll": "I will", + "I'm": "I am", + "I've": "I have", + "ain't": "is not", + "aren't": "are not", + "can't": "can not", + "could've": "could have", + "couldn't": "could not", + "didn't": "did not", + "doesn't": "does not", + "don't": "do not", + "gonna": "going to", + "gotta": "got to", + "hadn't": "had not", + "hasn't": "has not", + "haven't": "have not", + "he'd": "he would", + "he'll": "he will", + "he's": "he is", + "how'd": "how did", + "how'll": "how will", + "how's": "how is", + "isn't": "is not", + "it'd": "it would", + "it'll": "it will", + "it's": "it is", + "might've": "might have", + "mightn't": "might not", + "must've": "must have", + "mustn't": "must not", + "needn't": "need not", + "oughtn't": "ought not", + "shan't": "shall not", + "she'd": "she would", + "she'll": "she will", + "she's": "she is", + "should've": "should have", + "shouldn't": "should not", + "somebody's": "somebody is", + "someone'd": "someone would", + "someone'll": "someone will", + "someone's": "someone is", + "that'd": "that would", + "that'll": "that will", + "that's": "that is", + "there'd": "there would", + "there're": "there are", + "there's": "there is", + "they'd": "they would", + "they'll": "they will", + "they're": "they are", + "they've": "they have", + "wasn't": "was not", + "we'd": "we would", + "we'll": "we will", + "we're": "we are", + "we've": "we have", + "weren't": "were not", + "what'd": "what did", + "what'll": "what will", + "what're": "what are", + "what's": "what is", + "what've": "what have", + "whats": "what is", + "when'd": "when did", + "when's": "when 
is", + "where'd": "where did", + "where's": "where is", + "where've": "where have", + "who'd": "who would", + "who'd've": "who would have", + "who'll": "who will", + "who're": "who are", + "who's": "who is", + "who've": "who have", + "why'd": "why did", + "why're": "why are", + "why's": "why is", + "won't": "will not", + "won't've": "will not have", + "would've": "would have", + "wouldn't": "would not", + "wouldn't've": "would not have", + "y'ain't": "you are not", + "y'aint": "you are not", + "y'all": "you all", + "ya'll": "you all", + "you'd": "you would", + "you'd've": "you would have", + "you'll": "you will", + "you're": "you are", + "you've": "you have" + }, + "word_replacements": {}, + "number_replacements": { + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + "eleven": "11", + "twelve": "12", + "thirteen": "13", + "fourteen": "14", + "fifteen": "15", + "sixteen": "16", + "seventeen": "17", + "eighteen": "18", + "nineteen": "19", + "twenty": "20", + "thirty": "30", + "forty": "40", + "fifty": "50", + "sixty": "60", + "seventy": "70", + "eighty": "80", + "ninety": "90" + }, + "stopwords": [], + "articles": [ + "the", + "a", + "an" + ] +} \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/or.word b/lingua_franca/res/text/en-us/or.word new file mode 100644 index 0000000..c4fced5 --- /dev/null +++ b/lingua_franca/res/text/en-us/or.word @@ -0,0 +1 @@ +or \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/second.word b/lingua_franca/res/text/en-us/second.word new file mode 100644 index 0000000..2147e41 --- /dev/null +++ b/lingua_franca/res/text/en-us/second.word @@ -0,0 +1 @@ +second \ No newline at end of file diff --git a/lingua_franca/res/text/en-us/seconds.word b/lingua_franca/res/text/en-us/seconds.word new file mode 100644 index 0000000..729866f --- /dev/null +++ b/lingua_franca/res/text/en-us/seconds.word 
@@ -0,0 +1 @@ +seconds \ No newline at end of file diff --git a/lingua_franca/res/text/es-es/day.word b/lingua_franca/res/text/es-es/day.word new file mode 100644 index 0000000..1f65386 --- /dev/null +++ b/lingua_franca/res/text/es-es/day.word @@ -0,0 +1 @@ +día diff --git a/lingua_franca/res/text/es-es/days.word b/lingua_franca/res/text/es-es/days.word new file mode 100644 index 0000000..3b1c676 --- /dev/null +++ b/lingua_franca/res/text/es-es/days.word @@ -0,0 +1 @@ +días diff --git a/lingua_franca/res/text/es-es/hour.word b/lingua_franca/res/text/es-es/hour.word new file mode 100644 index 0000000..cb26009 --- /dev/null +++ b/lingua_franca/res/text/es-es/hour.word @@ -0,0 +1 @@ +hora diff --git a/lingua_franca/res/text/es-es/hours.word b/lingua_franca/res/text/es-es/hours.word new file mode 100644 index 0000000..49f065d --- /dev/null +++ b/lingua_franca/res/text/es-es/hours.word @@ -0,0 +1 @@ +horas diff --git a/lingua_franca/res/text/es-es/minute.word b/lingua_franca/res/text/es-es/minute.word new file mode 100644 index 0000000..9b63882 --- /dev/null +++ b/lingua_franca/res/text/es-es/minute.word @@ -0,0 +1 @@ +minuto diff --git a/lingua_franca/res/text/es-es/minutes.word b/lingua_franca/res/text/es-es/minutes.word new file mode 100644 index 0000000..5028337 --- /dev/null +++ b/lingua_franca/res/text/es-es/minutes.word @@ -0,0 +1 @@ +minutos diff --git a/lingua_franca/res/text/es-es/second.word b/lingua_franca/res/text/es-es/second.word new file mode 100644 index 0000000..9c41ac6 --- /dev/null +++ b/lingua_franca/res/text/es-es/second.word @@ -0,0 +1 @@ +segundo diff --git a/lingua_franca/res/text/es-es/seconds.word b/lingua_franca/res/text/es-es/seconds.word new file mode 100644 index 0000000..f9955ce --- /dev/null +++ b/lingua_franca/res/text/es-es/seconds.word @@ -0,0 +1 @@ +segundos diff --git a/lingua_franca/res/text/fa-ir/and.word b/lingua_franca/res/text/fa-ir/and.word new file mode 100644 index 0000000..438fc2d --- /dev/null +++ 
b/lingua_franca/res/text/fa-ir/and.word @@ -0,0 +1 @@ +و \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/date_time.json b/lingua_franca/res/text/fa-ir/date_time.json new file mode 100644 index 0000000..1a43989 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/date_time.json @@ -0,0 +1,180 @@ +{ + "decade_format": { + "1": { + "match": "^\\d$", + "format": "{x}" + }, + "2": { + "match": "^1\\d$", + "format": "{xx}" + }, + "3": { + "match": "^\\d0$", + "format": "{x0}" + }, + "4": { + "match": "^[2-9]\\d$", + "format": "{x0} {x}" + }, + "default": "{number}" + }, + "hundreds_format": { + "1": { + "match": "^\\d{3}$", + "format": "{x_in_x00} hundred" + }, + "default": "{number}" + }, + "thousand_format": { + "1": { + "match": "^\\d00\\d$", + "format": "{x_in_x000} thousand" + }, + "2": { + "match": "^1\\d00$", + "format": "{xx_in_xx00} hundred" + }, + "3": { + "match": "^\\d{2}00$", + "format": "{x0_in_x000} {x_in_x00} hundred" + }, + "4": { + "match": "^(1\\d{3})|(\\d0\\d{2})$", + "format": "{xx_in_xx00}" + }, + "5": { + "match": "^\\d{4}$", + "format": "{x0_in_x000} {x_in_x00}" + }, + "default": "{number}" + }, + "year_format": { + "1": { + "match": "^\\d\\d?$", + "format": "{formatted_decade} {bc}" + }, + "2": { + "match": "^\\d00$", + "format": "{formatted_hundreds} {bc}" + }, + "3": { + "match": "^\\d{3}$", + "format": "{formatted_hundreds} {formatted_decade} {bc}" + }, + "4": { + "match": "^\\d{2}00$", + "format": "{formatted_thousand} {bc}" + }, + "5": { + "match": "^\\d00\\d$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "6": { + "match": "^\\d{2}0\\d$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "7": { + "match": "^\\d{4}$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "default": "{year} {bc}", + "bc": "بعد از میلاد" + }, + "date_format": { + "date_full": "{weekday}, {day} {month} {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + 
"date_full_no_year_month": "{weekday}, {day}", + "today": "امروز", + "tomorrow": "فردا", + "yesterday": "دیروز" + }, + "date_time_format": { + "date_time": "{formatted_date} ساعت {formatted_time}" + }, + "weekday": { + "0": "دوشنبه", + "1": "سه شنبه", + "2": "چهارشنبه", + "3": "پنج شنبه", + "4": "جمعه", + "5": "شنبه", + "6": "یکشنبه" + }, + "date": { + "1": "یکم", + "2": "دوم", + "3": "سوم", + "4": "چهارم", + "5": "پنجم", + "6": "ششم", + "7": "هفتم", + "8": "هشتم", + "9": "نهم", + "10": "دهم", + "11": "یازدهم", + "12": "دوازدهم", + "13": "سیزدهم", + "14": "چهاردهم", + "15": "پونزدهم", + "16": "شونزدهم", + "17": "هیفدهم", + "18": "هیجدهم", + "19": "نوزدهم", + "20": "بیستم", + "21": "بیست و یکم", + "22": "بیست و دوم", + "23": "بیست و سوم", + "24": "بیست و چهارم", + "25": "بیست و پنجم", + "26": "بیست و ششم", + "27": "بیست و هفتم", + "28": "بیست و هشتم", + "29": "بیست و نهم", + "30": "سیم", + "31": "سی و یکم" + }, + "month": { + "1": "ژانویه", + "2": "فوریه", + "3": "مارس", + "4": "آوریل", + "5": "مه", + "6": "جون", + "7": "جولای", + "8": "آگوست", + "9": "سپتامبر", + "10": "اکتبر", + "11": "نوامبر", + "12": "دسامبر" + }, + "number": { + "0": "صفر", + "1": "یک", + "2": "دو", + "3": "سه", + "4": "چهار", + "5": "پنج", + "6": "شش", + "7": "هفت", + "8": "هشت", + "9": "نه", + "10": "ده", + "11": "یازده", + "12": "دوازده", + "13": "سیزده", + "14": "چهارده", + "15": "پونزده", + "16": "شونزده", + "17": "هیفده", + "18": "هیجده", + "19": "نوزده", + "20": "بیست", + "30": "سی", + "40": "چهل", + "50": "پنجاه", + "60": "شصت", + "70": "هفتاد", + "80": "هشتاد", + "90": "نود" + } +} diff --git a/lingua_franca/res/text/fa-ir/date_time_test.json b/lingua_franca/res/text/fa-ir/date_time_test.json new file mode 100644 index 0000000..72321e3 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/date_time_test.json @@ -0,0 +1,36 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "یک بعد از میلاد" }, + "2": {"datetime_param": "10, 1, 31, 13, 
22, 3", "bc": "True", "assertEqual": "ده بعد از میلاد" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده دوازده" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده چهل و شش" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیجده صفر هفت" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیفده هیفده" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "نوزده هشتاد و هشت"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و نه"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست هیجده"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست بیست و یک"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست سی"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "دو هزار و صد" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و یک بیست بعد از میلاد" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و دو چهل و یک بعد از میلاد" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "پنجاه و دو هزار" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, + "4": {"datetime_param": "2018, 2, 
4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "فردا"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "امروز"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "دیروز"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت یک و بیست و دو دقیقه بعد از ظهر"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت سیزده و بیست و دو دقیقه"} + } +} diff --git a/lingua_franca/res/text/fa-ir/day.word b/lingua_franca/res/text/fa-ir/day.word new file mode 100644 index 0000000..dfc15b7 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/day.word @@ -0,0 +1 @@ +روز \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/days.word b/lingua_franca/res/text/fa-ir/days.word new file mode 100644 index 0000000..dfc15b7 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/days.word @@ -0,0 +1 @@ +روز \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/hour.word b/lingua_franca/res/text/fa-ir/hour.word new file mode 100644 index 0000000..3f2b7b1 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/hour.word @@ -0,0 +1 @@ +ساعت \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/hours.word b/lingua_franca/res/text/fa-ir/hours.word new file mode 100644 index 0000000..3f2b7b1 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/hours.word @@ 
-0,0 +1 @@ +ساعت \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/minute.word b/lingua_franca/res/text/fa-ir/minute.word new file mode 100644 index 0000000..1e9a05d --- /dev/null +++ b/lingua_franca/res/text/fa-ir/minute.word @@ -0,0 +1 @@ +دقیقه \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/minutes.word b/lingua_franca/res/text/fa-ir/minutes.word new file mode 100644 index 0000000..1e9a05d --- /dev/null +++ b/lingua_franca/res/text/fa-ir/minutes.word @@ -0,0 +1 @@ +دقیقه \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/or.word b/lingua_franca/res/text/fa-ir/or.word new file mode 100644 index 0000000..aa43ee0 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/or.word @@ -0,0 +1 @@ +یا \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/second.word b/lingua_franca/res/text/fa-ir/second.word new file mode 100644 index 0000000..3d2bee6 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/second.word @@ -0,0 +1 @@ +ثانیه \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/seconds.word b/lingua_franca/res/text/fa-ir/seconds.word new file mode 100644 index 0000000..3d2bee6 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/seconds.word @@ -0,0 +1 @@ +ثانیه \ No newline at end of file diff --git a/lingua_franca/res/text/fr-fr/date_time.json b/lingua_franca/res/text/fr-fr/date_time.json new file mode 100644 index 0000000..4344920 --- /dev/null +++ b/lingua_franca/res/text/fr-fr/date_time.json @@ -0,0 +1,147 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^\\d0$", "format": "{x0}"}, + "3": {"match": "^[2-6]1$", "format": "{x0}-et-un"}, + "4": {"match": "^[2-6|8]\\d$", "format": "{x0}-{x}"}, + "5": {"match": "^\\d{2}$", "format": "{xx}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{1,2}$", "format": "{formatted_decade}"}, + "2": {"match": "^100$", "format": "cent"}, + "3": {"match": "^\\d00$", "format": 
"{x_in_x00}-cents"}, + "4": {"match": "^1\\d{2}$", "format": "cent-{formatted_decade}"}, + "5": {"match": "^\\d{3}$", "format": "{x_in_x00}-cent-{formatted_decade}"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^1000$", "format": "mille"}, + "2": {"match": "^\\d000$", "format": "{x_in_x000}-mille"}, + "3": {"match": "^1\\d{3}$", "format": "mille-{formatted_hundreds}"}, + "4": {"match": "^\\d{4}$", "format": "{x_in_x000}-mille-{formatted_hundreds}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^[1-9]\\d{3}$", "format": "{formatted_thousand} {bc}"}, + "default": "{year} {bc}", + "bc": "avant Jésus Christ " + }, + "date_format": { + "date_full": "{weekday} {day} {month} {formatted_year}", + "date_full_no_year": "{weekday} {day} {month}", + "date_full_no_year_month": "{weekday} {day}", + "today": "aujourd'hui", + "tomorrow": "demain", + "yesterday": "hier" + }, + "date_time_format": { + "date_time": "{formatted_date} {formatted_time}" + }, + "weekday": { + "0": "lundi", + "1": "mardi", + "2": "mercredi", + "3": "jeudi", + "4": "vendredi", + "5": "samedi", + "6": "dimanche" + }, + "date": { + "1": "premier", + "2": "deux", + "3": "trois", + "4": "quatre", + "5": "cinq", + "6": "six", + "7": "sept", + "8": "huit", + "9": "neuf", + "10": "dix", + "11": "onze", + "12": "douze", + "13": "treize", + "14": "quatorze", + "15": "quinze", + "16": "seize", + "17": "dix-sept", + "18": "dix-huit", + "19": "dix-neuf", + "20": "vingt", + "21": "vingt-et-un", + "22": "vingt-deux", + "23": "vingt-trois", + "24": "vingt-quatre", + "25": "vingt-cinq", + "26": "vingt-six", + "27": "vingt-sept", + "28": "vingt-huit", + "29": "vingt-neuf", + "30": "trente", + "31": "trente-et-un" + }, + "month": { + "1": "janvier", + "2": "février", + "3": "mars", + "4": "avril", + "5": "mai", + "6": "juin", + "7": "juillet", 
+ "8": "août", + "9": "septembre", + "10": "octobre", + "11": "novembre", + "12": "décembre" + }, + "number": { + "0": "zéro", + "1": "un", + "2": "deux", + "3": "trois", + "4": "quatre", + "5": "cinq", + "6": "six", + "7": "sept", + "8": "huit", + "9": "neuf", + "10": "dix", + "11": "onze", + "12": "douze", + "13": "treize", + "14": "quatorze", + "15": "quinze", + "16": "seize", + "17": "dix-sept", + "18": "dix-huit", + "19": "dix-neuf", + "20": "vingt", + "30": "trente", + "40": "quarante", + "50": "cinquante", + "60": "soixante", + "70": "soixante-dix", + "71": "soixante-et-onze", + "72": "soixante-douze", + "73": "soixante-treize", + "74": "soixante-quatorze", + "75": "soixante-quinze", + "76": "soixante-seize", + "77": "soixante-dix-sept", + "78": "soixante-dix-huit", + "79": "soixante-dix-neuf", + "80": "quatre-vingt", + "90": "quatre-vingt-dix", + "91": "quatre-vingt-onze", + "92": "quatre-vingt-douze", + "93": "quatre-vingt-treize", + "94": "quatre-vingt-quatorze", + "95": "quatre-vingt-quinze", + "96": "quatre-vingt-seize", + "97": "quatre-vingt-dix-sept", + "98": "quatre-vingt-dix-huit", + "99": "quatre-vingt-dix-neuf" + } +} diff --git a/lingua_franca/res/text/fr-fr/date_time_test.json b/lingua_franca/res/text/fr-fr/date_time_test.json new file mode 100644 index 0000000..448f219 --- /dev/null +++ b/lingua_franca/res/text/fr-fr/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "un avant Jésus Christ" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dix avant Jésus Christ" }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "quatre-vingt-douze avant Jésus Christ" }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "huit-cent-trois" }, + "5": {"datetime_param": "111, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "cent-onze" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 
3", "bc": "None", "assertEqual": "quatre-cent-cinquante-quatre" }, + "7": {"datetime_param": "2005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deux-mille-cinq" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille-douze" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille-quarante-six" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille-huit-cent-sept" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille-sept-cent-dix-sept" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille-neuf-cent-quatre-vingt-huit"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-neuf"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-dix-huit"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-vingt-et-un"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-trente"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deux-mille-cent" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trois-mille-cent-vingt avant Jésus Christ" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trois-mille-deux-cent-quarante-et-un avant Jésus Christ" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinq-mille-deux-cents" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille-cent" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": 
"deux-mille-cent" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "mardi trente-et-un janvier deux-mille-dix-sept"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "dimanche quatre février deux-mille-dix-huit"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "dimanche quatre février"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "dimanche quatre"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "demain"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "aujourd'hui"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "hier"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "dimanche quatre février"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "dimanche quatre février deux-mille-dix-huit"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "mardi trente-et-un janvier deux-mille-dix-sept une heure vingt-deux de l'après-midi"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "mardi trente-et-un janvier deux-mille-dix-sept treize heures vingt-deux"} + } +} diff --git a/lingua_franca/res/text/fr-fr/day.word b/lingua_franca/res/text/fr-fr/day.word new file mode 100644 index 0000000..3e1393b --- /dev/null +++ b/lingua_franca/res/text/fr-fr/day.word @@ -0,0 +1 @@ +jour diff --git a/lingua_franca/res/text/fr-fr/days.word b/lingua_franca/res/text/fr-fr/days.word new file mode 100644 index 0000000..6a0300a --- /dev/null +++ b/lingua_franca/res/text/fr-fr/days.word 
@@ -0,0 +1 @@ +jours diff --git a/lingua_franca/res/text/fr-fr/hour.word b/lingua_franca/res/text/fr-fr/hour.word new file mode 100644 index 0000000..cfa09b2 --- /dev/null +++ b/lingua_franca/res/text/fr-fr/hour.word @@ -0,0 +1 @@ +heure diff --git a/lingua_franca/res/text/fr-fr/hours.word b/lingua_franca/res/text/fr-fr/hours.word new file mode 100644 index 0000000..5afb41b --- /dev/null +++ b/lingua_franca/res/text/fr-fr/hours.word @@ -0,0 +1 @@ +heures diff --git a/lingua_franca/res/text/fr-fr/minute.word b/lingua_franca/res/text/fr-fr/minute.word new file mode 100644 index 0000000..cfcd96c --- /dev/null +++ b/lingua_franca/res/text/fr-fr/minute.word @@ -0,0 +1 @@ +minute diff --git a/lingua_franca/res/text/fr-fr/minutes.word b/lingua_franca/res/text/fr-fr/minutes.word new file mode 100644 index 0000000..5cf0e30 --- /dev/null +++ b/lingua_franca/res/text/fr-fr/minutes.word @@ -0,0 +1 @@ +minutes diff --git a/lingua_franca/res/text/fr-fr/second.word b/lingua_franca/res/text/fr-fr/second.word new file mode 100644 index 0000000..110f968 --- /dev/null +++ b/lingua_franca/res/text/fr-fr/second.word @@ -0,0 +1 @@ +seconde diff --git a/lingua_franca/res/text/fr-fr/seconds.word b/lingua_franca/res/text/fr-fr/seconds.word new file mode 100644 index 0000000..7eac62a --- /dev/null +++ b/lingua_franca/res/text/fr-fr/seconds.word @@ -0,0 +1 @@ +secondes diff --git a/lingua_franca/res/text/hu-hu/date_time.json b/lingua_franca/res/text/hu-hu/date_time.json new file mode 100644 index 0000000..9dcea53 --- /dev/null +++ b/lingua_franca/res/text/hu-hu/date_time.json @@ -0,0 +1,132 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^20$", "format": "húsz"}, + "4": {"match": "^\\d0$", "format": "{x0}"}, + "5": {"match": "^[2-9]\\d$", "format": "{x0}{x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^2\\d{2}$", "format": "kétszáz"}, + "2": {"match": "^\\d{3}$", "format": 
"{x_in_x00}száz"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^1\\d{3}$", "format": "ezer"}, + "2": {"match": "^2\\d{3}$", "format": "kétezer"}, + "3": {"match": "^\\d{4}$", "format": "{x_in_x000}ezer"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{bc} {formatted_decade}"}, + "2": {"match": "^\\d000$", "format": "{bc} {formatted_thousand}"}, + "3": {"match": "^\\d{3}$", "format": "{bc} {formatted_hundreds}{formatted_decade}"}, + "4": {"match": "^[2-9]\\d00$", "format": "{bc} {formatted_thousand}-{formatted_hundreds}"}, + "5": {"match": "^1[0-9]00$", "format": "{bc} {formatted_thousand}{formatted_hundreds}"}, + "6": {"match": "^[2-9]0\\d{2}$", "format": "{bc} {formatted_thousand}-{formatted_decade}"}, + "7": {"match": "^10\\d{2}$", "format": "{bc} {formatted_thousand}{formatted_decade}"}, + "8": {"match": "^[2-9]00\\d$", "format": "{bc} {formatted_thousand}{formatted_decade}"}, + "9": {"match": "^1\\d{3}$", "format": "{bc} {formatted_thousand}{formatted_hundreds}{formatted_decade}"}, + "10": {"match": "^[2-9]\\d{3}$", "format": "{bc} {formatted_thousand}-{formatted_hundreds}{formatted_decade}"}, + "default": "{bc} {year}", + "bc": "kr.e." 
+ }, + "date_format": { + "date_full": "{formatted_year} {month} {day}, {weekday}", + "date_full_no_year": "{month} {day}, {weekday}", + "date_full_no_year_month": "{day}, {weekday}", + "today": "ma", + "tomorrow": "holnap", + "yesterday": "tegnap" + }, + "date_time_format": { + "date_time": "{formatted_date}, {formatted_time}" + }, + "weekday": { + "0": "hétfő", + "1": "kedd", + "2": "szerda", + "3": "csütörtök", + "4": "péntek", + "5": "szombat", + "6": "vasárnap" + }, + "date": { + "1": "elseje", + "2": "másodika", + "3": "harmadika", + "4": "negyedike", + "5": "ötödike", + "6": "hatodika", + "7": "hetedike", + "8": "nyolcadika", + "9": "kilencedike", + "10": "tizedike", + "11": "tizenegyedike", + "12": "tizenkettedike", + "13": "tizenharmadika", + "14": "tizennegyedike", + "15": "tizenötödike", + "16": "tizenhatodika", + "17": "tizenhetedike", + "18": "tizennyolcadika", + "19": "tizenkilencedike", + "20": "huszadika", + "21": "huszonegyedike", + "22": "huszonkettedike", + "23": "huszonharmadika", + "24": "huszonnegyedike", + "25": "huszonötödike", + "26": "huszonhatodika", + "27": "huszonhetedike", + "28": "huszonnyolcadika", + "29": "huszonkilencedike", + "30": "harmincadika", + "31": "harmincegyedike" + }, + "month": { + "1": "január", + "2": "február", + "3": "március", + "4": "április", + "5": "május", + "6": "június", + "7": "július", + "8": "augusztus", + "9": "szeptember", + "10": "október", + "11": "november", + "12": "december" + }, + "number": { + "0": "nulla", + "1": "egy", + "2": "kettő", + "3": "három", + "4": "négy", + "5": "öt", + "6": "hat", + "7": "hét", + "8": "nyolc", + "9": "kilenc", + "10": "tíz", + "11": "tizenegy", + "12": "tizenkettő", + "13": "tizenhárom", + "14": "tizennégy", + "15": "tizenöt", + "16": "tizenhat", + "17": "tizenhét", + "18": "tizennyolc", + "19": "tizenkilenc", + "20": "huszon", + "30": "harminc", + "40": "negyven", + "50": "ötven", + "60": "hatvan", + "70": "hetven", + "80": "nyolcvan", + "90": "kilencven" + } +} diff 
--git a/lingua_franca/res/text/hu-hu/date_time_test.json b/lingua_franca/res/text/hu-hu/date_time_test.json new file mode 100644 index 0000000..cfe6601 --- /dev/null +++ b/lingua_franca/res/text/hu-hu/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. egy" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. tíz" }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. kilencvenkettő" }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nyolcszázhárom" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nyolcszáztizenegy" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "négyszázötvennégy" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezeröt" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezertizenkettő" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezernegyvenhat" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezernyolcszázhét" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezerhétszáztizenhét" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezerkilencszáznyolcvannyolc"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-kilenc"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-tizennyolc"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-huszonegy"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-harminc"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "kétezer-egyszáz" }, 
+ "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezer" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. háromezer-egyszázhúsz" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. háromezer-kétszáznegyvenegy" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ötezer-kétszáz" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezeregyszáz" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "kétezer-egyszáz" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "kétezer-tizenhét január harmincegyedike, kedd"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "kétezer-tizennyolc február negyedike, vasárnap"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "február negyedike, vasárnap"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "negyedike, vasárnap"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "holnap"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "ma"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "tegnap"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "február negyedike, vasárnap"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "kétezer-tizennyolc február negyedike, vasárnap"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", 
"assertEqual": "kétezer-tizenhét január harmincegyedike, kedd, délután egy óra huszonkettő"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "kétezer-tizenhét január harmincegyedike, kedd, tizenhárom óra huszonkettő"} + } +} diff --git a/lingua_franca/res/text/it-it/date_time.json b/lingua_franca/res/text/it-it/date_time.json new file mode 100644 index 0000000..4580b11 --- /dev/null +++ b/lingua_franca/res/text/it-it/date_time.json @@ -0,0 +1,153 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9][1|8]", "format": "{xx}"}, + "5": {"match": "^[2-9]\\d$", "format": "{x0}{x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "cento"}, + "2": {"match": "^\\d{3}$", "format": "{x_in_x00}cento"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^1\\d{3}$", "format": "mille"}, + "2": {"match": "^\\d{4}$", "format": "{x_in_x000}mila"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d000$", "format": "{formatted_thousand} {bc}"}, + "4": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "5": {"match": "^1[0-9]00$", "format": "{formatted_thousand} {formatted_hundreds} {bc}"}, + "6": {"match": "^10\\d{2}$", "format": "{formatted_thousand} e {formatted_decade} {bc}"}, + "7": {"match": "^[2-9][0-9]00$", "format": "{formatted_thousand} {formatted_hundreds} {bc}"}, + "8": {"match": "^20\\d{2}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "9": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{formatted_thousand} {formatted_hundreds} {formatted_decade} {bc}"}, + "10": {"match": "^[2-9]000$", "format": 
"{formatted_thousand} {bc}"}, + "11": {"match": "^20\\d{2}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "12": {"match": "^([2-9]\\d{3})|(\\d0\\d{2})$", "format": "{formatted_thousand} {formatted_hundreds} {formatted_decade} {bc}"}, + "13": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_hundreds} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "a.C." + }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "oggi", + "tomorrow": "domani", + "yesterday": "ieri" + }, + "date_time_format": { + "date_time": "{formatted_date} alle {formatted_time}" + }, + "weekday": { + "0": "lunedì", + "1": "martedì", + "2": "mercoledì", + "3": "giovedì", + "4": "venerdì", + "5": "sabato", + "6": "domenica" + }, + "date": { + "1": "primo", + "2": "due", + "3": "tre", + "4": "quattro", + "5": "cinque", + "6": "sei", + "7": "sette", + "8": "otto", + "9": "nove", + "10": "dieci", + "11": "undici", + "12": "dodici", + "13": "tredici", + "14": "quattordici", + "15": "quindici", + "16": "sedici", + "17": "diciassette", + "18": "diciotto", + "19": "diciannove", + "20": "venti", + "21": "ventuno", + "22": "ventidue", + "23": "ventitre", + "24": "ventiquattro", + "25": "venticinque", + "26": "ventisei", + "27": "ventisette", + "28": "ventotto", + "29": "ventinove", + "30": "trenta", + "31": "trentuno" + }, + "month": { + "1": "gennaio", + "2": "febbraio", + "3": "marzo", + "4": "aprile", + "5": "maggio", + "6": "giugno", + "7": "luglio", + "8": "agosto", + "9": "settembre", + "10": "ottobre", + "11": "novembre", + "12": "dicembre" + }, + "number": { + "0": "zero", + "1": "uno", + "2": "due", + "3": "tre", + "4": "quattro", + "5": "cinque", + "6": "sei", + "7": "sette", + "8": "otto", + "9": "nove", + "10": "dieci", + "11": "undici", + "12": "dodici", + "13": "tredici", + "14": "quattordici", + "15": 
"quindici", + "16": "sedici", + "17": "diciassette", + "18": "diciotto", + "19": "diciannove", + "20": "venti", + "21": "ventuno", + "28": "ventotto", + "30": "trenta", + "31": "trentuno", + "38": "trentotto", + "40": "quaranta", + "41": "quarantuno", + "48": "quarantotto", + "50": "cinquanta", + "51": "cinquantuno", + "58": "cinquantotto", + "60": "sessanta", + "61": "sessantuno", + "68": "sessantotto", + "70": "settanta", + "71": "settantuno", + "78": "settantotto", + "80": "ottanta", + "81": "ottantuno", + "88": "ottantotto", + "90": "novanta", + "91": "novantuno", + "98": "novantotto", + "100": "cento", + "1000": "mille", + "2000": "duemila" + } +} diff --git a/lingua_franca/res/text/it-it/date_time_test.json b/lingua_franca/res/text/it-it/date_time_test.json new file mode 100644 index 0000000..f646c5c --- /dev/null +++ b/lingua_franca/res/text/it-it/date_time_test.json @@ -0,0 +1,42 @@ +{ "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "uno a.C." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dieci a.C." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "novantadue a.C." 
}, + "4": {"datetime_param": "100, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "cento" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ottocento undici" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "quattrocento cinquantaquattro" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille e cinque" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille e dodici" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille e quarantasei" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille ottocento sette" }, + "11": {"datetime_param": "1700, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille settecento" }, + "12": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille settecento diciassette" }, + "13": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille novecento ottantotto"}, + "14": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila nove"}, + "15": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila diciotto"}, + "16": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila ventuno"}, + "17": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila trenta"}, + "18": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duemila cento" }, + "19": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille" }, + "20": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila" }, + "21": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tremila cento venti a.C." }, + "22": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tremila duecento quarantuno a.C." 
}, + "23": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinquemila duecento" }, + "24": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille cento" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "martedì, trentuno gennaio, duemila diciassette"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "domenica, quattro febbraio, duemila diciotto"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "domenica, quattro febbraio"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "domenica, quattro"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "domani"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "oggi"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "ieri"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "domenica, quattro febbraio"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "domenica, quattro febbraio, duemila diciotto"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "martedì, trentuno gennaio, duemila diciassette alle una e ventidue del pomeriggio"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "martedì, trentuno gennaio, duemila diciassette alle tredici e ventidue"} + } +} diff --git a/lingua_franca/res/text/it-it/day.word b/lingua_franca/res/text/it-it/day.word new file mode 100644 index 0000000..e5d9b11 --- /dev/null +++ b/lingua_franca/res/text/it-it/day.word @@ -0,0 +1 @@ +giorno diff 
--git a/lingua_franca/res/text/it-it/days.word b/lingua_franca/res/text/it-it/days.word new file mode 100644 index 0000000..f042e0c --- /dev/null +++ b/lingua_franca/res/text/it-it/days.word @@ -0,0 +1 @@ +giorni diff --git a/lingua_franca/res/text/it-it/hour.word b/lingua_franca/res/text/it-it/hour.word new file mode 100644 index 0000000..aab78c9 --- /dev/null +++ b/lingua_franca/res/text/it-it/hour.word @@ -0,0 +1 @@ +ora diff --git a/lingua_franca/res/text/it-it/hours.word b/lingua_franca/res/text/it-it/hours.word new file mode 100644 index 0000000..b9f40a7 --- /dev/null +++ b/lingua_franca/res/text/it-it/hours.word @@ -0,0 +1 @@ +ore diff --git a/lingua_franca/res/text/it-it/minute.word b/lingua_franca/res/text/it-it/minute.word new file mode 100644 index 0000000..9b63882 --- /dev/null +++ b/lingua_franca/res/text/it-it/minute.word @@ -0,0 +1 @@ +minuto diff --git a/lingua_franca/res/text/it-it/minutes.word b/lingua_franca/res/text/it-it/minutes.word new file mode 100644 index 0000000..a4cb0ec --- /dev/null +++ b/lingua_franca/res/text/it-it/minutes.word @@ -0,0 +1 @@ +minuti diff --git a/lingua_franca/res/text/it-it/second.word b/lingua_franca/res/text/it-it/second.word new file mode 100644 index 0000000..254add6 --- /dev/null +++ b/lingua_franca/res/text/it-it/second.word @@ -0,0 +1 @@ +secondo diff --git a/lingua_franca/res/text/it-it/seconds.word b/lingua_franca/res/text/it-it/seconds.word new file mode 100644 index 0000000..bdfb0db --- /dev/null +++ b/lingua_franca/res/text/it-it/seconds.word @@ -0,0 +1 @@ +secondi diff --git a/lingua_franca/res/text/nl-nl/date_time.json b/lingua_franca/res/text/nl-nl/date_time.json new file mode 100644 index 0000000..b1f94fa --- /dev/null +++ b/lingua_franca/res/text/nl-nl/date_time.json @@ -0,0 +1,136 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x} en {x0}"}, + 
"default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "honderd"}, + "2": {"match": "^\\d{3}$", "format": "{x_in_x00} honderd"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^10\\d\\d$", "format": "duizend"}, + "2": {"match": "^\\d0\\d{2}$", "format": "{x_in_x000} duizend"}, + "3": {"match": "^1\\d00$", "format": "{xx_in_xx00} honderd"}, + "4": {"match": "^\\d{2}00$", "format": "{x_in_x00} en {x0_in_x000} honderd"}, + "5": {"match": "^\\d0\\d\\d$", "format": "{x_in_x000} duizend"}, + "6": {"match": "^1\\d{3}$", "format": "{xx_in_xx00}"}, + "7": {"match": "^\\d{4}$", "format": "{x_in_0x00} en {x0_in_x000}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d{1}?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d{2}?$", "format": "{formatted_decade} {bc}"}, + "3": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "4": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "5": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "6": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "7": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "8": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "v.c." 
+ }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day} {month}", + "today": "vandaag", + "tomorrow": "morgen", + "yesterday": "gisteren" + }, + "time_format": { + "time_full": "{minutes} over {hour}" + }, + "date_time_format": { + "date_time": "{formatted_date} om {formatted_time}" + }, + "weekday": { + "0": "maandag", + "1": "dinsdag", + "2": "woensdag", + "3": "donderdag", + "4": "vrijdag", + "5": "zaterdag", + "6": "zondag" + }, + "date": { + "1": "een", + "2": "twee", + "3": "drie", + "4": "vier", + "5": "vijf", + "6": "zes", + "7": "zeven", + "8": "acht", + "9": "negen", + "10": "tien", + "11": "elf", + "12": "twaalf", + "13": "dertien", + "14": "veertien", + "15": "vijftien", + "16": "zestien", + "17": "zeventien", + "18": "achttien", + "19": "negentien", + "20": "twintig", + "21": "eenentwintig", + "22": "tweeentwintig", + "23": "drieentwintig", + "24": "vierentwintig", + "25": "vijfentwintig", + "26": "zesentwintig", + "27": "zevenentwintig", + "28": "achtentwintig", + "29": "negenentwintig", + "30": "dertig", + "31": "eenendertig" + }, + "month": { + "1": "januari", + "2": "februari", + "3": "maart", + "4": "april", + "5": "mei", + "6": "juni", + "7": "juli", + "8": "augustus", + "9": "september", + "10": "oktober", + "11": "november", + "12": "december" + }, + "number": { + "0": "nul", + "1": "een", + "2": "twee", + "3": "drie", + "4": "vier", + "5": "vijf", + "6": "zes", + "7": "zeven", + "8": "acht", + "9": "negen", + "10": "tien", + "11": "elf", + "12": "twaalf", + "13": "dertien", + "14": "veertien", + "15": "vijftien", + "16": "zestien", + "17": "zeventien", + "18": "achttien", + "19": "negentien", + "20": "twintig", + "30": "dertig", + "40": "veertig", + "50": "vijftig", + "60": "zestig", + "70": "zeventig", + "80": "tachtig", + "90": "negentig" + } +} diff --git a/lingua_franca/res/text/nl-nl/date_time_test.json 
b/lingua_franca/res/text/nl-nl/date_time_test.json new file mode 100644 index 0000000..2486c28 --- /dev/null +++ b/lingua_franca/res/text/nl-nl/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "een v.c." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tien v.c." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "twee en negentig v.c." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht honderd drie" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht honderd elf" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vier honderd vier en vijftig" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duizend vijf" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duizend twaalf" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duizend zes en veertig" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "achttien zeven" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zeventien zeventien" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "negentien acht en tachtig"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend negen"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend achttien"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend een en twintig"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend dertig"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "een en twintig honderd" }, + 
"18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duizend" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "een en dertig twintig v.c." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "twee en dertig een en veertig v.c." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twee en vijftig honderd" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elf honderd" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "een en twintig honderd" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "dinsdag, eenendertig januari, twee duizend zeventien"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "zondag, vier februari, twee duizend achttien"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "zondag, vier februari"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "zondag, vier februari"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "morgen"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "vandaag"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "gisteren"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "zondag, vier februari"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "zondag, vier februari, twee duizend achttien"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", 
"use_ampm": "True", "assertEqual": "dinsdag, eenendertig januari, twee duizend zeventien om tweeentwintig over één 's middags"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "dinsdag, eenendertig januari, twee duizend zeventien om dertien uur tweeentwintig"} + } +} diff --git a/lingua_franca/res/text/nl-nl/day.word b/lingua_franca/res/text/nl-nl/day.word new file mode 100644 index 0000000..12ad2ae --- /dev/null +++ b/lingua_franca/res/text/nl-nl/day.word @@ -0,0 +1 @@ +dag diff --git a/lingua_franca/res/text/nl-nl/days.word b/lingua_franca/res/text/nl-nl/days.word new file mode 100644 index 0000000..d0ad4ed --- /dev/null +++ b/lingua_franca/res/text/nl-nl/days.word @@ -0,0 +1 @@ +dagen diff --git a/lingua_franca/res/text/nl-nl/hour.word b/lingua_franca/res/text/nl-nl/hour.word new file mode 100644 index 0000000..4421f65 --- /dev/null +++ b/lingua_franca/res/text/nl-nl/hour.word @@ -0,0 +1 @@ +uur diff --git a/lingua_franca/res/text/nl-nl/hours.word b/lingua_franca/res/text/nl-nl/hours.word new file mode 100644 index 0000000..219d3b5 --- /dev/null +++ b/lingua_franca/res/text/nl-nl/hours.word @@ -0,0 +1 @@ +uren diff --git a/lingua_franca/res/text/nl-nl/minute.word b/lingua_franca/res/text/nl-nl/minute.word new file mode 100644 index 0000000..342ac41 --- /dev/null +++ b/lingua_franca/res/text/nl-nl/minute.word @@ -0,0 +1 @@ +minuut diff --git a/lingua_franca/res/text/nl-nl/minutes.word b/lingua_franca/res/text/nl-nl/minutes.word new file mode 100644 index 0000000..0d2e320 --- /dev/null +++ b/lingua_franca/res/text/nl-nl/minutes.word @@ -0,0 +1 @@ +minuten diff --git a/lingua_franca/res/text/nl-nl/second.word b/lingua_franca/res/text/nl-nl/second.word new file mode 100644 index 0000000..110f968 --- /dev/null +++ b/lingua_franca/res/text/nl-nl/second.word @@ -0,0 +1 @@ +seconde diff --git a/lingua_franca/res/text/nl-nl/seconds.word b/lingua_franca/res/text/nl-nl/seconds.word new file mode 
100644 index 0000000..7995c0c --- /dev/null +++ b/lingua_franca/res/text/nl-nl/seconds.word @@ -0,0 +1 @@ +seconden diff --git a/lingua_franca/res/text/pl-pl/and.word b/lingua_franca/res/text/pl-pl/and.word new file mode 100644 index 0000000..ea14f36 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/and.word @@ -0,0 +1,2 @@ +i +oraz diff --git a/lingua_franca/res/text/pl-pl/date_time.json b/lingua_franca/res/text/pl-pl/date_time.json new file mode 100644 index 0000000..2930916 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/date_time.json @@ -0,0 +1,129 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} tysiąc"}, + "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundred"}, + "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred"}, + "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, + "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} oh {formatted_decade} {bc}"}, + "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "p.n.e." 
+ }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "dziś", + "tomorrow": "jutro", + "yesterday": "wczoraj" + }, + "date_time_format": { + "date_time": "{formatted_date} at {formatted_time}" + }, + "weekday": { + "0": "poniedziałek", + "1": "wtorek", + "2": "środa", + "3": "czwartek", + "4": "piątek", + "5": "sobota", + "6": "niedziela" + }, + "date": { + "1": "pierwszy", + "2": "drugi", + "3": "trzeci", + "4": "czwarty", + "5": "piąty", + "6": "szósty", + "7": "siódmy", + "8": "ósmy", + "9": "dziewiąty", + "10": "dziesiąty", + "11": "jedenasty", + "12": "dwunasty", + "13": "trzynasty", + "14": "czternasty", + "15": "piętnasty", + "16": "szesnasty", + "17": "siedemnasty", + "18": "osiemnasty", + "19": "dziewiętnasty", + "20": "dwudziesty", + "21": "dwudziesty pierwszy", + "22": "dwudziesty drugi", + "23": "dwudziesty trzeci", + "24": "dwudziesty czwarty", + "25": "dwudziesty piąty", + "26": "dwudziesty szósty", + "27": "dwudziesty siódmy", + "28": "dwudziesty ósmy", + "29": "dwudziesty dziewiąty", + "30": "trzydziesty", + "31": "trzydziesty pierwszy" + }, + "month": { + "1": "styczeń", + "2": "luty", + "3": "marzec", + "4": "kwiecień", + "5": "maj", + "6": "czerwiec", + "7": "lipiec", + "8": "sierpień", + "9": "wrzesień", + "10": "październik", + "11": "listopad", + "12": "grudzień" + }, + "number": { + "0": "zero", + "1": "jeden", + "2": "dwa", + "3": "trzy", + "4": "cztery", + "5": "pięć", + "6": "sześć", + "7": "siedem", + "8": "osiem", + "9": "dziewięć", + "10": "dziesięć", + "11": "jedenaście", + "12": "dwanaście", + "13": "trzynaście", + "14": "czternaście", + "15": "piętnaście", + "16": "szesnaście", + "17": "siedemnaście", + "18": "osiemnaście", + "19": "dziewiętnaście", + "20": "dwadzieścia", + "30": "trzydzieści", + "40": "czterdzieści", + "50": "pięćdziesiąt", + "60": "sześćdziesiąt", + "70": "siedemdziesiąt", + 
"80": "osiemdziesiąt", + "90": "dziewięćdziesiąt" + } +} diff --git a/lingua_franca/res/text/pl-pl/day.word b/lingua_franca/res/text/pl-pl/day.word new file mode 100644 index 0000000..fdd7af4 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/day.word @@ -0,0 +1 @@ +dzień diff --git a/lingua_franca/res/text/pl-pl/days.word b/lingua_franca/res/text/pl-pl/days.word new file mode 100644 index 0000000..a7738f0 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/days.word @@ -0,0 +1 @@ +dni diff --git a/lingua_franca/res/text/pl-pl/hour.word b/lingua_franca/res/text/pl-pl/hour.word new file mode 100644 index 0000000..8d8bde0 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/hour.word @@ -0,0 +1 @@ +godzina diff --git a/lingua_franca/res/text/pl-pl/hours.word b/lingua_franca/res/text/pl-pl/hours.word new file mode 100644 index 0000000..0973a51 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/hours.word @@ -0,0 +1 @@ +godzin diff --git a/lingua_franca/res/text/pl-pl/minute.word b/lingua_franca/res/text/pl-pl/minute.word new file mode 100644 index 0000000..1b52e61 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/minute.word @@ -0,0 +1 @@ +minuta diff --git a/lingua_franca/res/text/pl-pl/minutes.word b/lingua_franca/res/text/pl-pl/minutes.word new file mode 100644 index 0000000..7a715f8 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/minutes.word @@ -0,0 +1 @@ +minut diff --git a/lingua_franca/res/text/pl-pl/or.word b/lingua_franca/res/text/pl-pl/or.word new file mode 100644 index 0000000..dee660d --- /dev/null +++ b/lingua_franca/res/text/pl-pl/or.word @@ -0,0 +1,2 @@ +lub +albo diff --git a/lingua_franca/res/text/pl-pl/second.word b/lingua_franca/res/text/pl-pl/second.word new file mode 100644 index 0000000..f07a357 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/second.word @@ -0,0 +1 @@ +sekunda diff --git a/lingua_franca/res/text/pl-pl/seconds.word b/lingua_franca/res/text/pl-pl/seconds.word new file mode 100644 index 0000000..bbf810e --- /dev/null +++ 
b/lingua_franca/res/text/pl-pl/seconds.word @@ -0,0 +1 @@ +sekund diff --git a/lingua_franca/res/text/pt-pt/normalize.json b/lingua_franca/res/text/pt-pt/normalize.json new file mode 100644 index 0000000..52fd4b8 --- /dev/null +++ b/lingua_franca/res/text/pt-pt/normalize.json @@ -0,0 +1,98 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": false, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": true, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "catorze": "14", + "cem": "100", + "cento": "100", + "cinco": "5", + "cinquenta": "50", + "dez": "10", + "dezanove": "19", + "dezasseis": "16", + "dezassete": "17", + "dezoito": "18", + "dois": "2", + "doze": "12", + "duas": "2", + "duzentas": "200", + "duzentos": "200", + "mil": "1000", + "milhão": "1000000", + "nove": "9", + "novecentas": "900", + "novecentos": "900", + "noventa": "90", + "oitenta": "80", + "oito": "8", + "oitocentas": "800", + "oitocentos": "800", + "onze": "11", + "primeiro": "1", + "quarenta": "40", + "quatro": "4", + "quatrocentas": "400", + "quatrocentos": "400", + "quinhentas": "500", + "quinhentos": "500", + "quinze": "15", + "segundo": "2", + "seis": "6", + "seiscentas": "600", + "seiscentos": "600", + "sessenta": "60", + "sete": "7", + "setecentas": "700", + "setecentos": "700", + "setenta": "70", + "terceiro": "3", + "tres": "3", + "treze": "13", + "trezentas": "300", + "trezentos": "300", + "trinta": "30", + "três": "3", + "um": "1", + "uma": "1", + "vinte": "20", + "zero": "0" + }, + "stopwords": [ + "de", + "dos", + "das", + "lhe", + "lhes", + "me", + "e", + "no", + "nas", + "na", + "nos", + "em", + "para", + "este", + "esta", + "deste", + "desta", + "neste", + "nesta", + "nesse", + "nessa", + "foi", + "que" + ], + "articles": [ + "o", + "a", + "os", + "as" + ] +} \ No newline at end of file diff --git a/lingua_franca/res/text/ru-ru/date_time.json 
b/lingua_franca/res/text/ru-ru/date_time.json new file mode 100644 index 0000000..6c83547 --- /dev/null +++ b/lingua_franca/res/text/ru-ru/date_time.json @@ -0,0 +1,149 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "сто"}, + "2": {"match": "^2\\d{2}$", "format": "двести"}, + "3": {"match": "^[34]\\d{2}$", "format": "{x_in_x00}ста"}, + "4": {"match": "^\\d{3}$", "format": "{x_in_x00}сот"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^10\\d{2}$", "format": "тысяча"}, + "2": {"match": "^11\\d{2}$", "format": "тысяча сто"}, + "3": {"match": "^12\\d{2}$", "format": "тысяча двести"}, + "4": {"match": "^1[34]\\d{2}$", "format": "тысяча {x_in_x00}ста"}, + "5": {"match": "^1\\d{3}$", "format": "тысяча {x_in_x00}сот"}, + + "6": {"match": "^20\\d{2}$", "format": "две тысячи"}, + "7": {"match": "^21\\d{2}$", "format": "две тысячи сто"}, + "8": {"match": "^22\\d{2}$", "format": "две тысячи двести"}, + "9": {"match": "^2[34]\\d{2}$", "format": "две тысячи {x_in_x00}ста"}, + "10": {"match": "^2\\d{3}$", "format": "две тысячи {x_in_x00}сот"}, + + "11": {"match": "^[34]0\\d{2}$", "format": "{x_in_x000} тысячи"}, + "12": {"match": "^[34]1\\d{2}$", "format": "{x_in_x000} тысячи сто"}, + "13": {"match": "^[34]2\\d{2}$", "format": "{x_in_x000} тысячи двести"}, + "14": {"match": "^[34][34]\\d{2}$", "format": "{x_in_x000} тысячи {x_in_x00}ста"}, + "15": {"match": "^[34]\\d{3}$", "format": "{x_in_x000} тысячи {x_in_x00}сот"}, + + "16": {"match": "^[5-9]0\\d{2}$", "format": "{x_in_x000} тысяч"}, + "17": {"match": "^[5-9]1\\d{2}$", "format": "{x_in_x000} тысяч сто"}, + "18": {"match": "^[5-9]2\\d{2}$", "format": "{x_in_x000} тысяч двести"}, + "19": {"match": "^[5-9][34]\\d{2}$", "format": "{x_in_x000} тысяч 
{x_in_x00}ста"}, + "20": {"match": "^[5-9]\\d{3}$", "format": "{x_in_x000} тысяч {x_in_x00}сот"}, + + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "до нашей эры" + }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "сегодня", + "tomorrow": "завтра", + "yesterday": "вчера" + }, + "date_time_format": { + "date_time": "{formatted_date} в {formatted_time}" + }, + "weekday": { + "0": "в понедельник", + "1": "во вторник", + "2": "в среду", + "3": "в четверг", + "4": "в пятницу", + "5": "в субботу", + "6": "в воскресенье" + }, + "date": { + "1": "первого", + "2": "второго", + "3": "третьего", + "4": "четвёртого", + "5": "пятого", + "6": "шестого", + "7": "седьмого", + "8": "восьмого", + "9": "девятого", + "10": "десятого", + "11": "одиннадцатого", + "12": "двенадцатого", + "13": "тринадцатого", + "14": "четырнадцатого", + "15": "пятнадцатого", + "16": "шестнадцатого", + "17": "семнадцатого", + "18": "восемнадцатого", + "19": "девятнадцатого", + "20": "двадцатого", + "21": "двадцать первого", + "22": "двадцать второго", + "23": "двадцать третьего", + "24": "двадцать четвёртого", + "25": "двадцать пятого", + "26": "двадцать шестого", + "27": "двадцать седьмого", + "28": "двадцать восьмого", + "29": "двадцать девятого", + "30": "тридцатого", + "31": "тридцать первого" + }, + "month": { + "1": "января", + "2": "февраля", + "3": "марта", + "4": "апреля", + "5": "мая", + "6": "июня", + "7": "июля", + "8": "августа", + "9": 
"сентября", + "10": "октября", + "11": "ноября", + "12": "декабря" + }, + "number": { + "0": "ноль", + "1": "один", + "2": "два", + "3": "три", + "4": "четыре", + "5": "пять", + "6": "шесть", + "7": "семь", + "8": "восемь", + "9": "девять", + "10": "десять", + "11": "одиннадцать", + "12": "двенадцать", + "13": "тринадцать", + "14": "четырнадцать", + "15": "пятнадцать", + "16": "шестнадцать", + "17": "семнадцать", + "18": "восемнадцать", + "19": "девятнадцать", + "20": "двадцать", + "30": "тридцать", + "40": "сорок", + "50": "пятьдесят", + "60": "шестьдесят", + "70": "семьдесят", + "80": "восемьдесят", + "90": "девяносто" + } +} diff --git a/lingua_franca/res/text/ru-ru/date_time_test.json b/lingua_franca/res/text/ru-ru/date_time_test.json new file mode 100644 index 0000000..ac82786 --- /dev/null +++ b/lingua_franca/res/text/ru-ru/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "один до нашей эры" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "десять до нашей эры" }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "девяносто два до нашей эры" }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "восемьсот три" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "восемьсот одиннадцать" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "четыреста пятьдесят четыре" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча пять" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча двенадцать" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча сорок шесть" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча восемьсот семь" }, + "11": {"datetime_param": "1717, 1, 31, 
13, 22, 3", "bc": "None", "assertEqual": "тысяча семьсот семнадцать" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча девятьсот восемьдесят восемь"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи девять"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи восемнадцать"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи двадцать один"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи тридцать"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "две тысячи сто" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "три тысячи сто двадцать до нашей эры" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "три тысячи двести сорок один до нашей эры" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "пять тысяч двести" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча сто" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "две тысячи сто" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "во вторник, тридцать первого января, две тысячи семнадцать"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля, две тысячи восемнадцать"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": 
"2018, 2, 1, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "завтра"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "сегодня"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "вчера"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля, две тысячи восемнадцать"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "во вторник, тридцать первого января, две тысячи семнадцать в час двадцать два дня"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "во вторник, тридцать первого января, две тысячи семнадцать в тринадцать двадцать два"} + } +} diff --git a/lingua_franca/res/text/ru-ru/day.word b/lingua_franca/res/text/ru-ru/day.word new file mode 100644 index 0000000..bcb325e --- /dev/null +++ b/lingua_franca/res/text/ru-ru/day.word @@ -0,0 +1 @@ +день diff --git a/lingua_franca/res/text/ru-ru/days.word b/lingua_franca/res/text/ru-ru/days.word new file mode 100644 index 0000000..d170fff --- /dev/null +++ b/lingua_franca/res/text/ru-ru/days.word @@ -0,0 +1 @@ +дней diff --git a/lingua_franca/res/text/ru-ru/hour.word b/lingua_franca/res/text/ru-ru/hour.word new file mode 100644 index 0000000..4ca5e16 --- /dev/null +++ b/lingua_franca/res/text/ru-ru/hour.word @@ -0,0 +1 @@ +час diff --git a/lingua_franca/res/text/ru-ru/hours.word b/lingua_franca/res/text/ru-ru/hours.word new file mode 100644 index 0000000..80c7bea --- /dev/null +++ b/lingua_franca/res/text/ru-ru/hours.word @@ -0,0 +1 @@ +часов 
diff --git a/lingua_franca/res/text/ru-ru/minute.word b/lingua_franca/res/text/ru-ru/minute.word new file mode 100644 index 0000000..8a4bf42 --- /dev/null +++ b/lingua_franca/res/text/ru-ru/minute.word @@ -0,0 +1 @@ +минута diff --git a/lingua_franca/res/text/ru-ru/minutes.word b/lingua_franca/res/text/ru-ru/minutes.word new file mode 100644 index 0000000..00b261a --- /dev/null +++ b/lingua_franca/res/text/ru-ru/minutes.word @@ -0,0 +1 @@ +минут diff --git a/lingua_franca/res/text/ru-ru/normalize.json b/lingua_franca/res/text/ru-ru/normalize.json new file mode 100644 index 0000000..49bb02b --- /dev/null +++ b/lingua_franca/res/text/ru-ru/normalize.json @@ -0,0 +1,46 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "ноль": "0", + "нуль": "0", + "один": "1", + "одна": "1", + "два": "2", + "две": "2", + "три": "3", + "четыре": "4", + "пять": "5", + "шесть": "6", + "семь": "7", + "восемь": "8", + "девять": "9", + "десять": "10", + "одиннадцать": "11", + "двенадцать": "12", + "тринадцать": "13", + "четырнадцать": "14", + "пятнадцать": "15", + "шестнадцать": "16", + "семнадцать": "17", + "восемнадцать": "18", + "девятнадцать": "19", + "двадцать": "20", + "тридцать": "30", + "сорок": "40", + "пятьдесят": "50", + "шестьдесят": "60", + "семьдесят": "70", + "восемьдесят": "80", + "девяносто": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/lingua_franca/res/text/ru-ru/second.word b/lingua_franca/res/text/ru-ru/second.word new file mode 100644 index 0000000..f1be080 --- /dev/null +++ b/lingua_franca/res/text/ru-ru/second.word @@ -0,0 +1 @@ +секунда diff --git a/lingua_franca/res/text/ru-ru/seconds.word b/lingua_franca/res/text/ru-ru/seconds.word new file mode 100644 index 0000000..ba1da0d --- /dev/null +++ 
b/lingua_franca/res/text/ru-ru/seconds.word @@ -0,0 +1 @@ +секунд diff --git a/lingua_franca/res/text/sl-si/and.word b/lingua_franca/res/text/sl-si/and.word new file mode 100644 index 0000000..f087d89 --- /dev/null +++ b/lingua_franca/res/text/sl-si/and.word @@ -0,0 +1 @@ +in \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/date_time.json b/lingua_franca/res/text/sl-si/date_time.json new file mode 100644 index 0000000..9da33e4 --- /dev/null +++ b/lingua_franca/res/text/sl-si/date_time.json @@ -0,0 +1,123 @@ +{ + "decade_format": { + "1": {"match": "^0$", "format": ""}, + "2": {"match": "^\\d$", "format": "{x}"}, + "3": {"match": "^1\\d$", "format": "{xx}"}, + "4": {"match": "^\\d0$", "format": "{x0}"}, + "5": {"match": "^[2-9]\\d$", "format": "{x}in{x0}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{1,2}$", "format": "{formatted_decade}"}, + "2": {"match": "^1\\d{2}$", "format": "sto {formatted_decade}"}, + "3": {"match": "^2\\d{2}$", "format": "dvesto {formatted_decade}"}, + "4": {"match": "^\\d{3}$", "format": "{x_in_x00}sto {formatted_decade}"}, + "default": "{formatted_decade}" + }, + "thousand_format": { + "1": {"match": "^\\d{1,3}$", "format": "{formatted_hundreds}"}, + "2": {"match": "^1\\d{3}$", "format": "tisoč {formatted_hundreds}"}, + "default": "{x_in_x000} tisoč {formatted_hundreds}" + }, + "year_format": { + "default": "{formatted_thousand} {bc}", + "bc": "pr. n. št." 
+ }, + "date_format": { + "date_full": "{weekday}, {day} {month} {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "danes", + "tomorrow": "jutri", + "yesterday": "včeraj" + }, + "date_time_format": { + "date_time": "{formatted_date}, ob {formatted_time}" + }, + "weekday": { + "0": "ponedeljek", + "1": "torek", + "2": "sreda", + "3": "četrtek", + "4": "petek", + "5": "sobota", + "6": "nedelja" + }, + "date": { + "1": "prvi", + "2": "drugi", + "3": "tretji", + "4": "četrti", + "5": "peti", + "6": "šesti", + "7": "sedmi", + "8": "osmi", + "9": "deveti", + "10": "deseti", + "11": "enajsti", + "12": "dvanajsti", + "13": "trinajsti", + "14": "štirinajsti", + "15": "petnajsti", + "16": "šestnajsti", + "17": "sedemnajsti", + "18": "osemnajsti", + "19": "devetnajsti", + "20": "dvajseti", + "21": "enaindvajseti", + "22": "dvaindvajseti", + "23": "triindvajseti", + "24": "štiriindvajseti", + "25": "petindvajseti", + "26": "šestindvajseti", + "27": "sedemindvajseti", + "28": "osemindvajseti", + "29": "devetindvajseti", + "30": "trideseti", + "31": "enaintrideseti" + }, + "month": { + "1": "januar", + "2": "februar", + "3": "marec", + "4": "april", + "5": "maj", + "6": "junij", + "7": "julij", + "8": "avgust", + "9": "september", + "10": "oktober", + "11": "november", + "12": "december" + }, + "number": { + "0": "nič", + "1": "ena", + "2": "dva", + "3": "tri", + "4": "štiri", + "5": "pet", + "6": "šest", + "7": "sedem", + "8": "osem", + "9": "devet", + "10": "deset", + "11": "enajst", + "12": "dvanajst", + "13": "trinajst", + "14": "štirinajst", + "15": "petnajst", + "16": "šestnajst", + "17": "sedemnajst", + "18": "osemnajst", + "19": "devetnajst", + "20": "dvajset", + "30": "trideset", + "40": "štirideset", + "50": "petdeset", + "60": "šestdeset", + "70": "sedemdeset", + "80": "osemdeset", + "90": "devetdeset" + } +} diff --git a/lingua_franca/res/text/sl-si/date_time_test.json 
b/lingua_franca/res/text/sl-si/date_time_test.json new file mode 100644 index 0000000..7bebae0 --- /dev/null +++ b/lingua_franca/res/text/sl-si/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ena pr. n. št." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "deset pr. n. št." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dvaindevetdeset pr. n. št." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osemsto tri" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osemsto enajst" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "štiristo štiriinpetdeset" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč pet" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč dvanajst" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč šestinštirideset" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč osemsto sedem" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč sedemsto sedemnajst" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč devetsto oseminosemdeset"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč devet"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč osemnajst"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč enaindvajset"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč trideset"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dva tisoč sto" }, + 
"18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tri tisoč sto dvajset pr. n. št." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tri tisoč dvesto enainštirideset pr. n. št." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "pet tisoč dvesto" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč sto" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dva tisoč sto" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "torek, enaintrideseti januar dva tisoč sedemnajst"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "nedelja, četrti februar dva tisoč osemnajst"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "nedelja, četrti februar"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "nedelja, četrti"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "jutri"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "danes"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "včeraj"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "nedelja, četrti februar"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "nedelja, četrti februar dva tisoč osemnajst"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", 
"assertEqual": "torek, enaintrideseti januar dva tisoč sedemnajst, ob dvaindvajset čez ena p.m."}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "torek, enaintrideseti januar dva tisoč sedemnajst, ob trinajst dvaindvajset"} + } +} diff --git a/lingua_franca/res/text/sl-si/day.word b/lingua_franca/res/text/sl-si/day.word new file mode 100644 index 0000000..dc816d9 --- /dev/null +++ b/lingua_franca/res/text/sl-si/day.word @@ -0,0 +1 @@ +dan \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/days.word b/lingua_franca/res/text/sl-si/days.word new file mode 100644 index 0000000..7b0e614 --- /dev/null +++ b/lingua_franca/res/text/sl-si/days.word @@ -0,0 +1 @@ +dni \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/hour.word b/lingua_franca/res/text/sl-si/hour.word new file mode 100644 index 0000000..fa6c4e1 --- /dev/null +++ b/lingua_franca/res/text/sl-si/hour.word @@ -0,0 +1 @@ +ura \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/hours.word b/lingua_franca/res/text/sl-si/hours.word new file mode 100644 index 0000000..873003f --- /dev/null +++ b/lingua_franca/res/text/sl-si/hours.word @@ -0,0 +1 @@ +ur \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/minute.word b/lingua_franca/res/text/sl-si/minute.word new file mode 100644 index 0000000..02810df --- /dev/null +++ b/lingua_franca/res/text/sl-si/minute.word @@ -0,0 +1 @@ +minuta \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/minutes.word b/lingua_franca/res/text/sl-si/minutes.word new file mode 100644 index 0000000..4b98366 --- /dev/null +++ b/lingua_franca/res/text/sl-si/minutes.word @@ -0,0 +1 @@ +minut \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/normalize.json b/lingua_franca/res/text/sl-si/normalize.json new file mode 100644 index 0000000..a0892fd --- /dev/null +++ b/lingua_franca/res/text/sl-si/normalize.json 
@@ -0,0 +1,44 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": false, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "nič": "0", + "ena": "1", + "dve": "2", + "dva": "2", + "tri": "3", + "štiri": "4", + "pet": "5", + "šest": "6", + "sedem": "7", + "osem": "8", + "devet": "9", + "deset": "10", + "enajst": "11", + "dvanajst": "12", + "trinajst": "13", + "štirinajst": "14", + "petnajst": "15", + "šestnajst": "16", + "sedemnajst": "17", + "osemnajst": "18", + "devetnajst": "19", + "dvajset": "20", + "trideset": "30", + "štirideset": "40", + "petdeset": "50", + "šestdeset": "60", + "sedemdeset": "70", + "osemdeset": "80", + "devetdeset": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/or.word b/lingua_franca/res/text/sl-si/or.word new file mode 100644 index 0000000..784f900 --- /dev/null +++ b/lingua_franca/res/text/sl-si/or.word @@ -0,0 +1 @@ +ali \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/second.word b/lingua_franca/res/text/sl-si/second.word new file mode 100644 index 0000000..ef210e2 --- /dev/null +++ b/lingua_franca/res/text/sl-si/second.word @@ -0,0 +1 @@ +sekunda \ No newline at end of file diff --git a/lingua_franca/res/text/sl-si/seconds.word b/lingua_franca/res/text/sl-si/seconds.word new file mode 100644 index 0000000..300f8e5 --- /dev/null +++ b/lingua_franca/res/text/sl-si/seconds.word @@ -0,0 +1 @@ +sekund \ No newline at end of file diff --git a/lingua_franca/res/text/sv-se/date_time.json b/lingua_franca/res/text/sv-se/date_time.json new file mode 100644 index 0000000..7c43912 --- /dev/null +++ b/lingua_franca/res/text/sv-se/date_time.json @@ -0,0 +1,129 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": 
"^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundra"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} tusen"}, + "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundra"}, + "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundra"}, + "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, + "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} noll {formatted_decade} {bc}"}, + "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "före kristus" + }, + "date_format": { + "date_full": "{weekday}, den {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, den {day} {month}", + "date_full_no_year_month": "{weekday}, den {day}", + "today": "idag", + "tomorrow": "imorgon", + "yesterday": "igår" + }, + "date_time_format": { + "date_time": "{formatted_date} klockan {formatted_time}" + }, + "weekday": { + "0": "måndag", + "1": "tisdag", + "2": "onsdag", + "3": "torsdag", + "4": "fredag", + "5": "lördag", + "6": "söndag" + }, + "date": { + "1": "första", + "2": "andra", + "3": "tredje", + "4": "fjärde", + "5": "femte", + "6": "sjätte", + "7": "sjunde", + "8": "åttonde", + "9": "nionde", + "10": "tionde", + "11": "elfte", + "12": "tolfte", + "13": "trettonde", + "14": 
"fjortonde", + "15": "femtonde", + "16": "sextonde", + "17": "sjuttonde", + "18": "artonde", + "19": "nittonde", + "20": "tjugonde", + "21": "tjugoförsta", + "22": "tjugoandra", + "23": "tjugotredje", + "24": "tjugofjärde", + "25": "tjugofemte", + "26": "tjugosjätte", + "27": "tjugosjunde", + "28": "tjugoåttonde", + "29": "tjugonionde", + "30": "trettionde", + "31": "trettiförsta" + }, + "month": { + "1": "januari", + "2": "februari", + "3": "mars", + "4": "april", + "5": "maj", + "6": "juni", + "7": "juli", + "8": "augusti", + "9": "september", + "10": "oktober", + "11": "november", + "12": "december" + }, + "number": { + "0": "noll", + "1": "ett", + "2": "två", + "3": "tre", + "4": "fyra", + "5": "fem", + "6": "sex", + "7": "sju", + "8": "åtta", + "9": "nio", + "10": "tio", + "11": "elva", + "12": "tolv", + "13": "tretton", + "14": "fjorton", + "15": "femton", + "16": "sexton", + "17": "sjutton", + "18": "arton", + "19": "nitton", + "20": "tjugo", + "30": "trettio", + "40": "förtio", + "50": "femtio", + "60": "sextio", + "70": "sjuttio", + "80": "åttio", + "90": "nittio" + } +} diff --git a/lingua_franca/res/text/sv-se/date_time_test.json b/lingua_franca/res/text/sv-se/date_time_test.json new file mode 100644 index 0000000..0fda399 --- /dev/null +++ b/lingua_franca/res/text/sv-se/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ett före kristus" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tio före kristus" }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "nittio två före kristus" }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "åtta hundra tre" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "åtta hundra elva" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "fyra hundra femtio fyra" }, + "7": 
{"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ett tusen fem" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tio tolv" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tio förtio sex" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "arton noll sju" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "sjutton sjutton" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nitton åttio åtta"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "två tusen nio"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tjugo arton"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tjugo tjugo ett"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tjugo trettio"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tjugo ett hundra" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ett tusen" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "två tusen" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trettio ett tjugo före kristus" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trettio två förtio ett före kristus" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "femtio två hundra" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elva hundra" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tjugo ett hundra" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tisdag, den trettiförsta januari, tjugo 
sjutton"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "söndag, den fjärde februari, tjugo arton"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "söndag, den fjärde februari"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "söndag, den fjärde"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "imorgon"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "idag"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "igår"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "söndag, den fjärde februari"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "söndag, den fjärde februari, tjugo arton"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tisdag, den trettiförsta januari, tjugo sjutton klockan tjugotvå minuter över ett på eftermiddagen"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "tisdag, den trettiförsta januari, tjugo sjutton klockan tretton tjugotvå"} + } +} diff --git a/lingua_franca/res/text/sv-se/day.word b/lingua_franca/res/text/sv-se/day.word new file mode 100644 index 0000000..12ad2ae --- /dev/null +++ b/lingua_franca/res/text/sv-se/day.word @@ -0,0 +1 @@ +dag diff --git a/lingua_franca/res/text/sv-se/days.word b/lingua_franca/res/text/sv-se/days.word new file mode 100644 index 0000000..c4dbf3e --- /dev/null +++ b/lingua_franca/res/text/sv-se/days.word @@ -0,0 +1 @@ +dagar diff --git a/lingua_franca/res/text/sv-se/hour.word b/lingua_franca/res/text/sv-se/hour.word new file mode 100644 index 
0000000..ba2c26d --- /dev/null +++ b/lingua_franca/res/text/sv-se/hour.word @@ -0,0 +1 @@ +timme diff --git a/lingua_franca/res/text/sv-se/hours.word b/lingua_franca/res/text/sv-se/hours.word new file mode 100644 index 0000000..9a7aac0 --- /dev/null +++ b/lingua_franca/res/text/sv-se/hours.word @@ -0,0 +1 @@ +timmar diff --git a/lingua_franca/res/text/sv-se/minute.word b/lingua_franca/res/text/sv-se/minute.word new file mode 100644 index 0000000..7a715f8 --- /dev/null +++ b/lingua_franca/res/text/sv-se/minute.word @@ -0,0 +1 @@ +minut diff --git a/lingua_franca/res/text/sv-se/minutes.word b/lingua_franca/res/text/sv-se/minutes.word new file mode 100644 index 0000000..19e9917 --- /dev/null +++ b/lingua_franca/res/text/sv-se/minutes.word @@ -0,0 +1 @@ +minuter diff --git a/lingua_franca/res/text/sv-se/second.word b/lingua_franca/res/text/sv-se/second.word new file mode 100644 index 0000000..bbf810e --- /dev/null +++ b/lingua_franca/res/text/sv-se/second.word @@ -0,0 +1 @@ +sekund diff --git a/lingua_franca/res/text/sv-se/seconds.word b/lingua_franca/res/text/sv-se/seconds.word new file mode 100644 index 0000000..5256c15 --- /dev/null +++ b/lingua_franca/res/text/sv-se/seconds.word @@ -0,0 +1 @@ +sekunder diff --git a/lingua_franca/res/text/tr-tr/day.word b/lingua_franca/res/text/tr-tr/day.word new file mode 100644 index 0000000..9832baf --- /dev/null +++ b/lingua_franca/res/text/tr-tr/day.word @@ -0,0 +1 @@ +gün diff --git a/lingua_franca/res/text/tr-tr/days.word b/lingua_franca/res/text/tr-tr/days.word new file mode 100644 index 0000000..ed16a30 --- /dev/null +++ b/lingua_franca/res/text/tr-tr/days.word @@ -0,0 +1 @@ +günler diff --git a/lingua_franca/res/text/tr-tr/hour.word b/lingua_franca/res/text/tr-tr/hour.word new file mode 100644 index 0000000..08bcc92 --- /dev/null +++ b/lingua_franca/res/text/tr-tr/hour.word @@ -0,0 +1 @@ +saat diff --git a/lingua_franca/res/text/tr-tr/hours.word b/lingua_franca/res/text/tr-tr/hours.word new file mode 100644 index 
0000000..9f39470 --- /dev/null +++ b/lingua_franca/res/text/tr-tr/hours.word @@ -0,0 +1 @@ +saatler diff --git a/lingua_franca/res/text/tr-tr/minute.word b/lingua_franca/res/text/tr-tr/minute.word new file mode 100644 index 0000000..f84ec6c --- /dev/null +++ b/lingua_franca/res/text/tr-tr/minute.word @@ -0,0 +1 @@ +dakika diff --git a/lingua_franca/res/text/tr-tr/minutes.word b/lingua_franca/res/text/tr-tr/minutes.word new file mode 100644 index 0000000..da54805 --- /dev/null +++ b/lingua_franca/res/text/tr-tr/minutes.word @@ -0,0 +1 @@ +dakikalar diff --git a/lingua_franca/res/text/tr-tr/second.word b/lingua_franca/res/text/tr-tr/second.word new file mode 100644 index 0000000..ec58400 --- /dev/null +++ b/lingua_franca/res/text/tr-tr/second.word @@ -0,0 +1 @@ +saniye diff --git a/lingua_franca/res/text/tr-tr/seconds.word b/lingua_franca/res/text/tr-tr/seconds.word new file mode 100644 index 0000000..aa7cfaa --- /dev/null +++ b/lingua_franca/res/text/tr-tr/seconds.word @@ -0,0 +1 @@ +saniyeler diff --git a/lingua_franca/time.py b/lingua_franca/time.py new file mode 100644 index 0000000..17f46d0 --- /dev/null +++ b/lingua_franca/time.py @@ -0,0 +1,94 @@ +# +# Copyright 2018 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from datetime import datetime +from dateutil.tz import gettz, tzlocal + + +__default_tz = None + + +def set_default_tz(tz): + global __default_tz + if isinstance(tz, str): + tz = gettz(tz) + __default_tz = tz + + +def default_timezone(): + """ Get the default timezone + + either a value set by downstream user with + lingua_franca.internal.set_default_tz + or default system value + + Returns: + (datetime.tzinfo): Definition of the default timezone + """ + return __default_tz or tzlocal() + + +def now_utc(): + """ Retrieve the current time in UTC + + Returns: + (datetime): The current time in Universal Time, aka GMT + """ + return to_utc(datetime.utcnow()) + + +def now_local(tz=None): + """ Retrieve the current time + + Args: + tz (datetime.tzinfo, optional): Timezone, default to user's settings + + Returns: + (datetime): The current time + """ + if not tz: + tz = default_timezone() + return datetime.now(tz) + + +def to_utc(dt): + """ Convert a datetime with timezone info to a UTC datetime + + Args: + dt (datetime): A datetime (presumably in some local zone) + Returns: + (datetime): time converted to UTC + """ + tzUTC = gettz("UTC") + if dt.tzinfo: + return dt.astimezone(tzUTC) + else: + return dt.replace(tzinfo=gettz("UTC")).astimezone(tzUTC) + + +def to_local(dt): + """ Convert a datetime to the user's local timezone + + Args: + dt (datetime): A datetime (if no timezone, defaults to UTC) + Returns: + (datetime): time converted to the local timezone + """ + tz = default_timezone() + if dt.tzinfo: + return dt.astimezone(tz) + else: + return dt.replace(tzinfo=gettz("UTC")).astimezone(tz) + diff --git a/plugins/core.py b/plugins/core.py index d751987..bc48c5d 100644 --- a/plugins/core.py +++ b/plugins/core.py @@ -7,7 +7,7 @@ from vacore import VACore def start(core:VACore): manifest = { "name": "Core plugin", - "version": "2.2", + "version": "2.3", "default_options": { "mpcIsUse": True, @@ -19,6 +19,7 @@ def start(core:VACore): "ttsEngineId": "pyttsx", 
"ttsEngineId2": "", # двиг для прямой озвучки на сервере. Если пуст - используется ttsEngineId "playWavEngineId": "audioplayer", + "linguaFrancaLang": "ru", # язык для библиотеки lingua-franca конвертирования чисел "voiceAssNames": "ирина|ирины|ирину", "logPolicy": "cmd", # all | cmd | none @@ -53,7 +54,8 @@ def start_with_options(core:VACore, manifest:dict): if not os.path.exists(core.tmpdir): os.mkdir(core.tmpdir) - + import lingua_franca + lingua_franca.load_language(options["linguaFrancaLang"]) return manifest diff --git a/plugins_inactive/plugin_tts_silero_v3.py b/plugins_inactive/plugin_tts_silero_v3.py index 47b5840..ed036f8 100644 --- a/plugins_inactive/plugin_tts_silero_v3.py +++ b/plugins_inactive/plugin_tts_silero_v3.py @@ -1,7 +1,7 @@ # TTS plugin for silero engine # author: Vladislav Janvarev -# require torch 1.8+ +# require torch 1.10+ import os @@ -13,13 +13,15 @@ modname = os.path.basename(__file__)[:-3] # calculating modname def start(core:VACore): manifest = { "name": "TTS silero V3", - "version": "1.1", + "version": "1.2", "require_online": False, "default_options": { "speaker": "xenia", "threads": 4, "sample_rate": 24000, + "put_accent": True, + "put_yo": True, }, "tts": { @@ -54,7 +56,9 @@ def init(core:VACore): def towavfile(core:VACore, text_to_speech:str, wavfile:str): - text_to_speech = text_to_speech.replace("…","...") + text_to_speech = text_to_speech.replace("…","...") + text_to_speech = core.all_num_to_text(text_to_speech) + #print(text_to_speech) options = core.plugin_options(modname) @@ -62,10 +66,10 @@ def towavfile(core:VACore, text_to_speech:str, wavfile:str): # рендерим wav path = core.model.save_wav(text=text_to_speech, - speaker=speaker, - put_accent=True, - put_yo=True, - sample_rate=options["sample_rate"]) + speaker=speaker, + put_accent=options["put_accent"], + put_yo=options["put_yo"], + sample_rate=options["sample_rate"]) # перемещаем wav на новое место if os.path.exists(wavfile): diff --git a/utils/all_num_to_text.py 
b/utils/all_num_to_text.py new file mode 100644 index 0000000..6d8fd0c --- /dev/null +++ b/utils/all_num_to_text.py @@ -0,0 +1,39 @@ +# MIT License +# Janvarev Vladislav +# +# library for translate all digits in text to pronounce + +import re + +#from utils.num_to_text_ru import num2text +from lingua_franca.format import pronounce_number + +def load_language(lang:str): + import lingua_franca + lingua_franca.load_language(lang) + +def convert_one_num_float(match_obj): + if match_obj.group() is not None: + text = str(match_obj.group()) + return pronounce_number(float(match_obj.group())) + +def convert_diapazon(match_obj): + if match_obj.group() is not None: + text = str(match_obj.group()) + text = text.replace("-"," тире ") + return all_num_to_text(text) + + +def all_num_to_text(text:str) -> str: + text = re.sub(r'[\d]*[.][\d]+-[\d]*[.][\d]+', convert_diapazon, text) + text = re.sub(r'-[\d]*[.][\d]+', convert_one_num_float, text) + text = re.sub(r'[\d]*[.][\d]+', convert_one_num_float, text) + text = re.sub(r'[\d]-[\d]+', convert_diapazon, text) + text = re.sub(r'-[\d]+', convert_one_num_float, text) + text = re.sub(r'[\d]+', convert_one_num_float, text) + text = text.replace("%", " процентов") + return text + +if __name__ == "__main__": + load_language("ru") + print(all_num_to_text("Ба ва 120.1-120.8, Да -30.1, Ка 44.05, Га 225. Рынок -10%. Тест")) \ No newline at end of file diff --git a/vacore.py b/vacore.py index 91df5da..625ad7b 100644 --- a/vacore.py +++ b/vacore.py @@ -6,7 +6,7 @@ from threading import Timer from jaa import JaaCore -version = "5.0" +version = "5.1" # main VACore class @@ -185,6 +185,10 @@ class VACore(JaaCore): self.tmpcnt += 1 return self.tmpdir+"/vacore_"+str(self.tmpcnt) + def all_num_to_text(self,text:str): + from utils.all_num_to_text import all_num_to_text + return all_num_to_text(text) + # -------- main function ---------- def execute_next(self,command,context):