pax_global_header00006660000000000000000000000064142621134340014512gustar00rootroot0000000000000052 comment=3c98f8f885014c884412def1675c12e1cdb8b8d5 lingua-franca-release-v0.4.3/000077500000000000000000000000001426211343400160315ustar00rootroot00000000000000lingua-franca-release-v0.4.3/.github/000077500000000000000000000000001426211343400173715ustar00rootroot00000000000000lingua-franca-release-v0.4.3/.github/workflows/000077500000000000000000000000001426211343400214265ustar00rootroot00000000000000lingua-franca-release-v0.4.3/.github/workflows/pr.yml000066400000000000000000000014541426211343400225760ustar00rootroot00000000000000# This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Pytest on: pull_request: branches: [ master ] jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: [3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install pytest pip install -r requirements.txt - name: Test with pytest run: | pytest lingua-franca-release-v0.4.3/.github/workflows/push.yml000066400000000000000000000062541426211343400231370ustar00rootroot00000000000000# This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions # In addition it will tag a release if setup.py is updated with a new version # and publish a release to pypi from the tag name: Python package on: push: branches: [ master ] jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: [3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: Set up 
Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install pytest pip install -r requirements.txt - name: Test with pytest run: | pytest tag-release-if-needed: runs-on: ubuntu-latest outputs: version: ${{ steps.tag.outputs.version }} steps: - uses: actions/checkout@v2 - id: tag name: Tag release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | git remote add tag_target "https://$GITHUB_TOKEN@github.com/MycroftAI/lingua-franca.git" VERSION=$(python setup.py --version) git tag -f release/v$VERSION || exit 0 if git push tag_target --tags; then echo "New tag published on github, push to PyPI as well." pip install twine wheel python setup.py sdist bdist_wheel twine check dist/* twine upload dist/* echo "Package pushed to PyPI. Prepare for mycroft-core PR." echo "::set-output name=version::$VERSION" fi update-mycroft-core: runs-on: ubuntu-latest needs: tag-release-if-needed steps: - uses: actions/checkout@v2 with: repository: MycroftAI/mycroft-core - name: Update mycroft-core env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | VERSION=${{needs.tag-release-if-needed.outputs.version}} if [[ $VERSION != *"."* ]]; then echo "Not a valid version number." exit 1 elif [[ $VERSION == *"-"* ]]; then echo "Pre-release suffix detected. Not pushing to mycroft-core." 
else sed -E "s/lingua-franca==[0-9]+\.[0-9]+\.[0-9]+/lingua-franca==$VERSION/" requirements/requirements.txt > tmp-requirements.txt mv tmp-requirements.txt requirements/requirements.txt echo "LINGUA_FRANCA_VERSION=$VERSION" >> $GITHUB_ENV fi - name: Create Pull Request if: ${{ env.LINGUA_FRANCA_VERSION }} uses: peter-evans/create-pull-request@v3 with: token: ${{ secrets.BOT_TOKEN }} push-to-fork: mycroft-adapt-bot/mycroft-core commit-message: Update Lingua Franca to v${{ env.LINGUA_FRANCA_VERSION }} branch: feature/update-lingua-franca delete-branch: true title: Update Lingua Franca to v${{ env.LINGUA_FRANCA_VERSION }} body: Automated update from mycroftai/lingua-franca. lingua-franca-release-v0.4.3/.gitignore000066400000000000000000000023451426211343400200250ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # VSCod(e/ium) .vscode/ vscode/ *.code-workspace lingua-franca-release-v0.4.3/Jenkinsfile000066400000000000000000000012321426211343400202130ustar00rootroot00000000000000pipeline { agent any options { buildDiscarder(logRotator(numToKeepStr: '5')) } stages { // Run the build against the dev branch to check for compile errors stage('Add CLA label to PR') { environment { //spawns GITHUB_USR and GITHUB_PSW environment variables GITHUB=credentials('38b2e4a6-167a-40b2-be6f-d69be42c8190') } steps { // Using an install of Github repo CLA tagger // (https://github.com/forslund/github-repo-cla) sh '~/github-repo-cla/mycroft-core-cla-check.sh' } } } } lingua-franca-release-v0.4.3/LICENSE000066400000000000000000000261351426211343400170450ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
lingua-franca-release-v0.4.3/MANIFEST.in000066400000000000000000000001741426211343400175710ustar00rootroot00000000000000include LICENSE include readme.md include requirements.txt recursive-include test *.py recursive-include lingua_franca/res *lingua-franca-release-v0.4.3/examples/000077500000000000000000000000001426211343400176475ustar00rootroot00000000000000lingua-franca-release-v0.4.3/examples/extract_stuff.py000066400000000000000000000055501426211343400231070ustar00rootroot00000000000000from lingua_franca.parse import extract_datetime, extract_number, \ extract_numbers, extract_duration, normalize # extract a number assert extract_number("nothing") is False assert extract_number("two million five hundred thousand tons of spinning " "metal") == 2500000 assert extract_number("six trillion") == 6000000000000.0 assert extract_number("six trillion", short_scale=False) == 6e+18 assert extract_number("1 and 3/4 cups") == 1.75 assert extract_number("1 cup and a half") == 1.5 ## extracts all numbers assert extract_numbers("nothing") == [] assert extract_numbers("this is a one twenty one test") == [1.0, 21.0] assert extract_numbers("1 dog, seven pigs, macdonald had a farm, " "3 times 5 macarena") == [1, 7, 3, 5] ## extract durations from datetime import timedelta assert extract_duration("nothing") == (None, 'nothing') assert extract_duration("Nineteen minutes past the hour") == ( timedelta(minutes=19), "past the hour") assert extract_duration("wake me up in three weeks, four hundred ninety seven" " days, and three hundred 91.6 seconds") == ( timedelta(weeks=3, days=497, seconds=391.6), "wake me up in , , and") assert extract_duration( "The movie is one hour, fifty seven and a half minutes long") == ( timedelta(hours=1, minutes=57.5), "the movie is , long") ## extract date times from datetime import datetime def extractWithFormat(text): date = datetime(2017, 6, 27, 13, 4) # Tue June 27, 2017 @ 1:04pm [extractedDate, leftover] = extract_datetime(text, date) extractedDate = 
extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(normalize(text)) assert res[0] == expected_date assert res[1] == expected_leftover testExtract("now is the time", "2017-06-27 13:04:00", "is time") testExtract("in a couple minutes", "2017-06-27 13:06:00", "") testExtract("What is the day after tomorrow's weather?", "2017-06-29 00:00:00", "what is weather") testExtract("Remind me at 10:45 pm", "2017-06-27 22:45:00", "remind me") testExtract("what is the weather on friday morning", "2017-06-30 08:00:00", "what is weather") testExtract("what is tomorrow's weather", "2017-06-28 00:00:00", "what is weather") testExtract("remind me to call mom next tuesday", "2017-07-04 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 3 weeks", "2017-07-18 00:00:00", "remind me to call mom") testExtract("set an alarm for tonight 9:30", "2017-06-27 21:30:00", "set alarm") testExtract("on the evening of june 5th 2017 remind me to call my mother", "2017-06-05 19:00:00", "remind me to call my mother") lingua-franca-release-v0.4.3/examples/pronounce_stuff.py000066400000000000000000000053001426211343400234360ustar00rootroot00000000000000from lingua_franca.format import nice_duration, nice_date, nice_date_time, \ nice_number, nice_time, pronounce_number # pronounce numbers assert nice_number(25/6) == "4 and a sixth" assert nice_number(201) == "201" assert nice_number(3.14159269) == "3 and a seventh" assert pronounce_number(3.14159269) == "three point one four" assert pronounce_number(0) == "zero" assert pronounce_number(10) == "ten" assert pronounce_number(201) == "two hundred and one" assert pronounce_number(102.3) == "one hundred and two point three" assert pronounce_number( 4092949192) == "four billion, ninety two million, nine hundred and forty nine thousand, one hundred and ninety two" assert pronounce_number(100034000000299792458, short_scale=True) == \ "one 
hundred quintillion, thirty four quadrillion, " \ "two hundred and ninety nine million, seven hundred and ninety " \ "two thousand, four hundred and fifty eight" assert pronounce_number(100034000000299792458, short_scale=False) == \ "one hundred trillion, thirty four thousand billion, " \ "two hundred and ninety nine million, seven hundred and ninety " \ "two thousand, four hundred and fifty eight" # pronounce datetime objects import datetime dt = datetime.datetime(2017, 1, 31, 13, 22, 3) assert nice_date(dt) == "tuesday, january thirty-first, twenty seventeen" assert nice_time(dt) == "one twenty two" assert nice_time(dt, use_ampm=True) == "one twenty two p.m." assert nice_time(dt, speech=False) == "1:22" assert nice_time(dt, speech=False, use_ampm=True) == "1:22 PM" assert nice_time(dt, speech=False, use_24hour=True) == "13:22" assert nice_time(dt, speech=False, use_24hour=True, use_ampm=True) == "13:22" assert nice_time(dt, use_24hour=True, use_ampm=True) == "thirteen twenty two" assert nice_time(dt, use_24hour=True, use_ampm=False) == "thirteen twenty two" assert nice_date_time(dt) == "tuesday, january thirty-first, twenty seventeen at one twenty two" # pronounce durations assert nice_duration(1) == "one second" assert nice_duration(3) == "three seconds" assert nice_duration(1, speech=False) == "0:01" assert nice_duration(61), "one minute one second" assert nice_duration(61, speech=False) == "1:01" assert nice_duration(5000) == "one hour twenty three minutes twenty seconds" assert nice_duration(5000, speech=False), "1:23:20" assert nice_duration(50000) == "thirteen hours fifty three minutes twenty seconds" assert nice_duration(50000, speech=False) == "13:53:20" assert nice_duration(500000) == "five days eighteen hours fifty three minutes twenty seconds" assert nice_duration(500000, speech=False), "5d 18:53:20" assert nice_duration(datetime.timedelta(seconds=500000), speech=False) == "5d 
class Fragment(object):
    """(Abstract) empty sentence fragment."""

    def __init__(self, tree):
        """
        Construct a sentence tree fragment wrapping ``tree``.

        Args:
            tree: Base tree for the sentence fragment. Its type depends on
                the subclass: a string for Word, a list of fragments for
                Sentence and Options.
        """
        self._tree = tree

    def tree(self):
        """Return the represented sentence tree as raw data."""
        return self._tree

    def expand(self):
        """
        Expanded version of the fragment. In this case an empty sentence.

        Returns:
            list[list[str]]: A list with one empty sentence
                (a sentence is a list of tokens/strings).
        """
        return [[]]

    def __str__(self):
        return str(self._tree)

    def __repr__(self):
        return repr(self._tree)


class Word(Fragment):
    """
    Single word in the sentence tree.

    Construct with a string as argument.
    """

    def expand(self):
        """
        Create one sentence that contains exactly that word.

        Returns:
            list[list[str]]: A list holding one sentence made of the
                wrapped string.
        """
        return [[self._tree]]


class Sentence(Fragment):
    """
    A Sentence made of several concatenations/words.

    Construct with a list of fragments as argument.
    """

    def expand(self):
        """
        Create every combination of the sub-fragment expansions.

        Returns:
            list[list[str]]: All sub-fragment expansions concatenated in
                every possible way.
        """
        expanded = [[]]
        for sub in self._tree:
            sub_expanded = sub.expand()
            # The partial sentences are walked back-to-front on every pass;
            # this keeps the historical output order, which callers may
            # rely on (e.g. "(a|b)(c|d)" yields the "b" sentences first).
            expanded = [sentence + new
                        for sentence in reversed(expanded)
                        for new in sub_expanded]
        return expanded


class Options(Fragment):
    """
    A combination of possible sub-sentences.

    Construct with a list of fragments as argument.
    """

    def expand(self):
        """
        Return all of its options as separated sub-sentences.

        Returns:
            list[list[str]]: The sentences created by the expansions of
                each option, in option order.
        """
        options = []
        for option in self._tree:
            options.extend(option.expand())
        return options


class SentenceTreeParser(object):
    """
    Generate sentence token trees from a list of tokens

    ['1', '(', '2', '|', '3', ')'] -> [['1', '2'], ['1', '3']]
    """

    def __init__(self, tokens):
        self.tokens = tokens
        # Initialised here so that _parse_expr() is usable even when it is
        # not reached through _parse(); _parse() still resets it to 0.
        self._current_position = 0

    def _parse(self):
        """
        Generate sentence token trees, starting from the first token.

        ['1', '(', '2', '|', '3', ')'] -> ['1', ['2', '3']]
        """
        self._current_position = 0
        return self._parse_expr()

    def _parse_expr(self):
        """
        Generate sentence token trees from the current position to the
        next closing parenthesis / end of the list and return it.

        ['1', '(', '2', '|', '3', ')'] -> ['1', [['2'], ['3']]]
        ['2', '|', '3'] -> [['2'], ['3']]

        Returns:
            Options: one Sentence per '|'-separated alternative.
        """
        # List of all generated sentences
        sentence_list = []
        # Currently active sentence; the list object is shared with the
        # Sentence wrapper, so later appends show up in the tree.
        cur_sentence = []
        sentence_list.append(Sentence(cur_sentence))
        # Determine which form the current expression has
        while self._current_position < len(self.tokens):
            cur = self.tokens[self._current_position]
            self._current_position += 1
            if cur == '(':
                # Parse the subexpression
                subexpr = self._parse_expr()
                # A subexpression with a single branch is not an
                # alternation: keep the literal "(" and ")" around it.
                normal_brackets = len(subexpr.tree()) == 1
                if normal_brackets:
                    cur_sentence.append(Word('('))
                cur_sentence.append(subexpr)
                if normal_brackets:
                    cur_sentence.append(Word(')'))
            elif cur == '|':
                # Begin parsing a new sentence
                cur_sentence = []
                sentence_list.append(Sentence(cur_sentence))
            elif cur == ')':
                # End parsing the current subexpression
                break
            # TODO anything special about {sth}?
            else:
                cur_sentence.append(Word(cur))
        return Options(sentence_list)

    def _expand_tree(self, tree):
        """
        Expand a tree of sub-sentences into all combined sentences.

        ['1', ['2', '3']] -> [['1', '2'], ['1', '3']]
        """
        return tree.expand()

    def expand_parentheses(self):
        """Parse ``self.tokens`` and return every expanded sentence."""
        tree = self._parse()
        return self._expand_tree(tree)
# The field names are the placeholder names that DateTimeFormat passes to the
# format strings of a language's date_time.json (e.g. "{x_in_x00}").
NUMBER_TUPLE = namedtuple(
    'number',
    ('x, xx, x0, x_in_x0, xxx, x00, x_in_x00, xx00, xx_in_xx00, x000, ' +
     'x_in_x000, x0_in_x000, x_in_0x00'))


class DateTimeFormat:
    """Render dates and years from per-language ``date_time.json`` data.

    The data for a language is loaded by :meth:`cache`. The formatting
    methods index ``self.lang_config[lang]`` directly, so ``cache(lang)``
    must have been called for a language before it is formatted.
    """

    def __init__(self, config_path):
        """
        Args:
            config_path (str): directory containing one sub-directory per
                language code, each holding a ``date_time.json`` file.
        """
        self.lang_config = {}
        self.config_path = config_path

    def cache(self, lang):
        """Load and prepare the formatting data for ``lang`` (once).

        If ``<config_path>/<lang>/date_time.json`` does not exist, the
        ``en-us`` file is loaded instead and stored under ``lang``.
        Every numbered rule ("1", "2", ...) of the decade/hundreds/thousand/
        year sections gets its ``match`` pattern compiled into ``re``.

        Args:
            lang (str): language code used as directory name and cache key.

        Raises:
            FileNotFoundError: if the ``en-us`` fallback file is missing too.
        """
        if lang in self.lang_config:
            return
        try:
            # Attempt to load the language-specific formatting data
            with open(self.config_path + '/' + lang + '/date_time.json',
                      'r', encoding='utf8') as lang_config_file:
                self.lang_config[lang] = json.load(lang_config_file)
        except FileNotFoundError:
            # Fallback to English formatting. The encoding is explicit here
            # as well, so the result does not depend on the locale default.
            with open(self.config_path + '/en-us/date_time.json',
                      'r', encoding='utf8') as lang_config_file:
                self.lang_config[lang] = json.load(lang_config_file)

        for section in ('decade_format', 'hundreds_format',
                        'thousand_format', 'year_format'):
            rules = self.lang_config[lang][section]
            i = 1
            # Rules are numbered consecutively from "1"; stop at the first gap.
            while rules.get(str(i)):
                rules[str(i)]['re'] = re.compile(rules[str(i)]['match'])
                i = i + 1

    def _number_strings(self, number, lang):
        """Split ``number`` into the pieces used by the format strings.

        Each piece is looked up in the language's ``number`` table; when
        there is no (non-empty) entry, its plain decimal string is used.

        Returns:
            NUMBER_TUPLE: one string per field.
        """
        names = self.lang_config[lang]['number']

        def spoken(value):
            # `or` (not a .get default) so that an empty translation also
            # falls back to the digits.
            text = str(value)
            return names.get(text) or text

        tens = int(number % 100 / 10)
        hundreds = int(number % 1000 / 100)
        centuries = int(number % 10000 / 100)
        thousands = int(number % 10000 / 1000)

        return NUMBER_TUPLE(
            x=spoken(number % 10),
            xx=spoken(number % 100),
            x0=spoken(tens * 10),
            x_in_x0=spoken(tens),
            xxx=spoken(number % 1000),
            x00=spoken(hundreds * 100),
            x_in_x00=spoken(hundreds),
            xx00=spoken(centuries * 100),
            xx_in_xx00=spoken(centuries),
            x000=spoken(thousands * 1000),
            x_in_x000=spoken(thousands),
            x0_in_x000=spoken(thousands * 10),
            # Previously the fallback sat inside the .get() argument, so an
            # untranslated digit produced None instead of its string.
            x_in_0x00=spoken(hundreds))

    def _format_string(self, number, format_section, lang):
        """Return the format string of the first rule matching ``number``.

        Rules "1", "2", ... of ``format_section`` are tried in order with
        ``re.match`` against ``str(number)``; the section's ``default``
        entry is returned when none matches.
        """
        section = self.lang_config[lang][format_section]
        # Read up front: a section without 'default' raises KeyError even
        # if a numbered rule would have matched (unchanged behavior).
        fallback = section['default']
        i = 1
        while section.get(str(i)):
            rule = section[str(i)]
            if rule['re'].match(str(number)):
                return rule['format']
            i = i + 1
        return fallback

    def _decade_format(self, number, number_tuple, lang):
        """Format the last two digits of ``number`` ('decade_format')."""
        s = self._format_string(number % 100, 'decade_format', lang)
        return s.format(x=number_tuple.x, xx=number_tuple.xx,
                        x0=number_tuple.x0, x_in_x0=number_tuple.x_in_x0,
                        number=str(number % 100))

    def _number_format_hundreds(self, number, number_tuple, lang,
                                formatted_decade):
        """Format the last three digits of ``number`` ('hundreds_format')."""
        s = self._format_string(number % 1000, 'hundreds_format', lang)
        return s.format(xxx=number_tuple.xxx, x00=number_tuple.x00,
                        x_in_x00=number_tuple.x_in_x00,
                        formatted_decade=formatted_decade,
                        number=str(number % 1000))

    def _number_format_thousand(self, number, number_tuple, lang,
                                formatted_decade, formatted_hundreds):
        """Format the last four digits of ``number`` ('thousand_format')."""
        s = self._format_string(number % 10000, 'thousand_format', lang)
        return s.format(x_in_x00=number_tuple.x_in_x00,
                        xx00=number_tuple.xx00,
                        xx_in_xx00=number_tuple.xx_in_xx00,
                        x000=number_tuple.x000,
                        x_in_x000=number_tuple.x_in_x000,
                        x0_in_x000=number_tuple.x0_in_x000,
                        x_in_0x00=number_tuple.x_in_0x00,
                        formatted_decade=formatted_decade,
                        formatted_hundreds=formatted_hundreds,
                        number=str(number % 10000))

    def date_format(self, dt, lang, now):
        """Format the date part of ``dt``.

        Without ``now`` the 'date_full' format is used. With ``now`` the
        year is left out when it equals ``now``'s year, the month as well
        when ``dt`` is a later day of ``now``'s month, and the 'tomorrow' /
        'today' / 'yesterday' formats take precedence for those three days.

        Args:
            dt (datetime): date to format
            lang (str): language code previously passed to cache()
            now (datetime or None): reference date, or a falsy value to
                always produce the full date

        Returns:
            str: the formatted date
        """
        format_str = 'date_full'
        if now:
            if dt.year == now.year:
                format_str = 'date_full_no_year'
                if dt.month == now.month and dt.day > now.day:
                    format_str = 'date_full_no_year_month'

            tomorrow = now + datetime.timedelta(days=1)
            yesterday = now - datetime.timedelta(days=1)
            if tomorrow.date() == dt.date():
                format_str = 'tomorrow'
            elif now.date() == dt.date():
                format_str = 'today'
            elif yesterday.date() == dt.date():
                format_str = 'yesterday'

        config = self.lang_config[lang]
        return config['date_format'][format_str].format(
            weekday=config['weekday'][str(dt.weekday())],
            month=config['month'][str(dt.month)],
            day=config['date'][str(dt.day)],
            formatted_year=self.year_format(dt, lang, False))

    def date_time_format(self, dt, lang, now, use_24hour, use_ampm):
        """Format date and time of ``dt`` with the 'date_time' format.

        The time part comes from the module-level ``nice_time`` function.
        """
        date_str = self.date_format(dt, lang, now)
        time_str = nice_time(dt, lang, use_24hour=use_24hour,
                             use_ampm=use_ampm)
        return self.lang_config[lang]['date_time_format']['date_time'].format(
            formatted_date=date_str, formatted_time=time_str)

    def year_format(self, dt, lang, bc):
        """Format the year of ``dt``.

        Args:
            dt (datetime): only ``dt.year`` is used
            lang (str): language code previously passed to cache()
            bc (bool): when true, the language's ``year_format['bc']`` text
                is supplied as ``{bc}``; otherwise an empty string

        Returns:
            str: the formatted year, with runs of spaces collapsed and
                surrounding whitespace stripped
        """
        number_tuple = self._number_strings(dt.year, lang)
        formatted_bc = (
            self.lang_config[lang]['year_format']['bc'] if bc else '')
        formatted_decade = self._decade_format(
            dt.year, number_tuple, lang)
        formatted_hundreds = self._number_format_hundreds(
            dt.year, number_tuple, lang, formatted_decade)
        formatted_thousand = self._number_format_thousand(
            dt.year, number_tuple, lang, formatted_decade, formatted_hundreds)

        s = self._format_string(dt.year, 'year_format', lang)

        # An empty {bc} (or other empty placeholder) leaves doubled or
        # trailing spaces behind; tidy them up.
        return re.sub(' +', ' ',
                      s.format(
                          year=str(dt.year),
                          century=str(int(dt.year / 100)),
                          decade=str(dt.year % 100),
                          formatted_hundreds=formatted_hundreds,
                          formatted_decade=formatted_decade,
                          formatted_thousand=formatted_thousand,
                          bc=formatted_bc)).strip()


date_time_format = DateTimeFormat(os.path.join(os.path.dirname(__file__),
                                               'res/text'))
@localized_function()
def pronounce_number(number, lang='', places=2, short_scale=True,
                     scientific=False, ordinals=False):
    """
    Convert a number to its spoken equivalent

    For example, '5' would be 'five'

    This function has no code of its own; it is a signature that the
    @localized_function decorator uses to dispatch the call to the
    implementation for the requested language.

    Args:
        number: the number to pronounce
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
        places (int): number of decimal places to express, default 2
        short_scale (bool) : use short (True) or long scale (False)
            https://en.wikipedia.org/wiki/Names_of_large_numbers
        scientific (bool) : convert and pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form "first" instead of "one"

    Returns:
        (str): The pronounced number
    """
def nice_year(dt, lang='', bc=False):
    """
    Format a datetime to a pronounceable year

    For example, generate 'nineteen-hundred and eighty-four' for year 1984

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
        bc (bool): put B.C. after the year (python does not support dates
            B.C. in datetime)
    Returns:
        (str): The formatted year string
    """
    # Resolve the full language code first; it selects the formatting data.
    locale_code = get_full_lang_code(lang)
    formatter = date_time_format
    formatter.cache(locale_code)
    spoken_year = formatter.year_format(dt, locale_code, bc)
    return spoken_year
def join_list(items, connector, sep=None, lang=''):
    """ Join a list into a phrase using the given connector word

    Examples:
        join_list([1,2,3], "and") ->  "1, 2 and 3"
        join_list([1,2,3], "and", ";") ->  "1; 2 and 3"

    Args:
        items (array): items to be joined; each one is converted with str()
        connector (str): connecting word (resource name), like "and" or "or"
        sep (str, optional): separator character, default = ","
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
    Returns:
        str: the connected list phrase, "" when there are no items
    """

    if not items:
        return ""
    if len(items) == 1:
        return str(items[0])

    # A space always follows the separator: "," -> ", " and ";" -> "; "
    if not sep:
        sep = ", "
    else:
        sep += " "
    # The last item needs str() like the others, otherwise non-string
    # items (as in the examples above) raise a TypeError.
    return (sep.join(str(item) for item in items[:-1]) +
            " " + _translate_word(connector, lang) +
            " " + str(items[-1]))
Examples are "25 seconds", "tomorrow", "7 days". Note: The reported period is currently limited to a number of days. Longer periods such as multiple weeks or months will be reported in days. Args: when (datetime): Local timezone relative_to (datetime): Baseline for relative time, default is now() lang (str, optional): Defaults to "en-us". Returns: str: Relative description of the given time """ if relative_to: now = relative_to else: now = now_local() delta = to_utc(when) - to_utc(now) if delta.total_seconds() < 1: return "now" if delta.total_seconds() < 90: if delta.total_seconds() == 1: return "one second" else: return "{} seconds".format(int(delta.total_seconds())) minutes = int((delta.total_seconds() + 30) // 60) # +30 to round minutes if minutes < 90: if minutes == 1: return "one minute" else: return "{} minutes".format(minutes) hours = int((minutes + 30) // 60) # +30 to round hours if hours < 36: if hours == 1: return "one hour" else: return "{} hours".format(hours) # TODO: "2 weeks", "3 months", "4 years", etc days = int((hours + 12) // 24) # +12 to round days if days == 1: return "1 day" else: return "{} days".format(days) lingua-franca-release-v0.4.3/lingua_franca/internal.py000066400000000000000000000713531426211343400230210ustar00rootroot00000000000000import os.path from functools import wraps from importlib import import_module from inspect import signature from warnings import warn from datetime import datetime from lingua_franca import config from lingua_franca.time import to_local _SUPPORTED_LANGUAGES = ("ca", "cs", "da", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "sl", "sv", "fa", "eu") _SUPPORTED_FULL_LOCALIZATIONS = ("ca-es", "cs-cz", "da-dk", "de-de", "en-au", "en-us", "es-es", "fr-fr", "hu-hu", "it-it", "nl-nl", "pl-pl", "fa-ir", "pt-pt", "ru-ru", "sl-si", "sv-se", "tr-tr", "eu-eu") _DEFAULT_FULL_LANG_CODES = {'ca': 'ca-es', 'cs': 'cs-cz', 'da': 'da-dk', 'de': 'de-de', 'en': 'en-us', 'es': 'es-es', 'eu': 'eu-eu', 'fa': 'fa-ir', 
def _raise_unsupported_language(language):
    """
    Raise an UnsupportedLanguageError naming the rejected language

    The message also lists every supported primary language code,
    separated by spaces.

    Arguments:
        language: str
            The language that was supplied.
    Raises:
        UnsupportedLanguageError: always
    """
    known_codes = ' '.join(_SUPPORTED_LANGUAGES)
    template = ("\nLanguage '{language}' is not yet "
                "supported by Lingua Franca. "
                "Supported language codes "
                "include the following:\n{supported}")
    message = template.format(language=language, supported=known_codes)
    raise UnsupportedLanguageError(message)
def load_language(lang):
    """Load `lang` and its functions into memory. Will only import those
    functions which belong to a loaded module. In other words, if you have
    lingua_franca.parse loaded, but *not* lingua_franca.format,
    running `load_language('es')` will only import the Spanish-language
    parsers, and not the formatters.

    The reverse is also true: importing a module, such as
    `import lingua_franca.parse`, will only import those functions
    which belong to currently-loaded languages.

    Arguments:
        lang (str): the language code to load (any supported lang code,
                    whether 'primary' or 'full')
                    Case-insensitive.
    Raises:
        TypeError: if `lang` is not a string
    """
    if not isinstance(lang, str):
        # Use the type's name: concatenating the type object itself raised
        # an unrelated "can only concatenate str" TypeError instead.
        raise TypeError("lingua_franca.load_language expects 'str' "
                        "(got " + type(lang).__name__ + ")")
    # Supported codes are stored in lower case; normalise so that
    # "EN" or "en-US" are recognised as documented.
    lang = lang.lower()
    if lang not in _SUPPORTED_LANGUAGES:
        if lang in _SUPPORTED_FULL_LOCALIZATIONS:
            # Languages are tracked by their primary code
            lang = get_primary_lang_code(lang)
    if lang not in __loaded_langs:
        __loaded_langs.append(lang)
        if not __default_lang:
            # The first language loaded becomes the default
            set_default_lang(lang)
        _set_active_langs(__loaded_langs)
def set_default_lang(lang_code):
    """ Set the active BCP-47 language code to be used in formatting/parsing

    A primary language family selects its default localization
    (ex: `set_default_lang("en")` will default to "en-US"), while a
    supported full lang code is kept as the active localization.
    For more information about valid lang codes, see get_default_lang()
    and get_default_loc()

    Args:
        lang_code (str): BCP-47 language code, e.g. "en-us" or "es-mx"
    Raises:
        UnsupportedLanguageError: if the primary language is not supported
    """
    global __default_lang, __active_lang_code

    normalized = lang_code.lower()
    family = get_primary_lang_code(normalized)
    if family not in _SUPPORTED_LANGUAGES:
        _raise_unsupported_language(normalized)
    __default_lang = family

    # make sure the default language is loaded.
    # also make sure the default language is at the front.
    # position doesn't matter here, but it clarifies things while debugging.
    if __default_lang in __loaded_langs:
        __loaded_langs.remove(__default_lang)
    __loaded_langs.insert(0, __default_lang)
    _refresh_function_dict()

    # Keep an explicitly requested localization; otherwise use the
    # default localization of the language family.
    __active_lang_code = (normalized if is_supported_full_lang(normalized)
                          else get_full_lang_code(__default_lang))
def get_full_lang_code(lang=''):
    """ Get the full language code for `lang`

    TODO remove this wrapper when invalid lang codes are removed
    (currently deprecated)

    Args:
        lang (str, optional): a BCP-47 language code; when empty or None
                              the default localization is used (None
                              additionally emits NoneLangWarning)
    Returns:
        str: `lang` itself when it is a supported full code, otherwise
             the full code derived from it; the default localization
             (with InvalidLangWarning) when `lang` is not supported
    """
    if not lang:
        if lang is None:
            warn(NoneLangWarning)
        lang = get_default_loc()

    if is_supported_full_lang(lang):
        return lang

    try:
        return __get_full_lang_code_deprecation_warning(lang)
    except UnsupportedLanguageError:
        warn(InvalidLangWarning)
        return get_default_loc()
A call to extract_number('uno', lang='es') will locate and call lingua_franca.lang.parse_es.extract_number_es('uno') By contrast, here's the decorator above format.nice_number, with the param: @localized_function(run_own_code_on=[UnsupportedLanguageError]) def nice_number(number, lang='', speech=True, denominators=None): Here, nice_number() itself will be executed in the event that the localizer raises an UnsupportedLanguageError. Arguments: run_own_code_on(list(type), optional) A list of Error types (ValueError, NotImplementedError, etc) which, if they are raised, will trigger the wrapped function's own code. If this argument is omitted, the function itself will never be run. Calls to the wrapped function will be passed to the appropriate, localized function. """ # Make sure everything in run_own_code_on is an Error or None BadTypeError = \ ValueError("@localized_function(run_own_code_on=<>) expected an " "Error type, or a list of Error types. Instead, it " "received this value:\n" + str(run_own_code_on)) # TODO deprecate these kwarg values 6-12 months after v0.3.0 releases if run_own_code_on != [None]: def is_error_type(_type): if not callable(_type): return False _instance = _type() rval = isinstance(_instance, BaseException) if _instance else True del _instance return rval if not isinstance(run_own_code_on, list): try: run_own_code_on = list(run_own_code_on) except TypeError: raise BadTypeError if not all((is_error_type(e) for e in run_own_code_on)): raise BadTypeError # Begin wrapper def localized_function_decorator(func): # Wrapper's logic def _call_localized_function(func, *args, **kwargs): lang_code = None load_langs_on_demand = config.load_langs_on_demand unload_language_afterward = False func_signature = signature(func) func_params = list(func_signature.parameters) lang_param_index = func_params.index('lang') full_lang_code = None # Check if we need to add timezone awareness to any datetime object if config.inject_timezones: for key, value in 
kwargs.items(): if isinstance(value, datetime) and value.tzinfo is None: kwargs[key] = to_local(value) for idx, value in enumerate(args): if isinstance(value, datetime) and value.tzinfo is None: args = (*args[:idx], to_local(value), *args[idx + 1:]) # Check if we're passing a lang as a kwarg if 'lang' in kwargs.keys(): lang_param = kwargs['lang'] if lang_param is None: warn(NoneLangWarning) lang_code = get_default_lang() else: lang_code = lang_param # Check if we're passing a lang as a positional arg elif lang_param_index < len(args): lang_param = args[lang_param_index] if lang_param is None: warn(NoneLangWarning) lang_code = get_default_lang() elif lang_param in _SUPPORTED_LANGUAGES or \ lang_param in _SUPPORTED_FULL_LOCALIZATIONS: lang_code = args[lang_param_index] args = args[:lang_param_index] + args[lang_param_index+1:] # Turns out, we aren't passing a lang code at all lang_code = lang_code or get_default_lang() if not lang_code: if load_langs_on_demand: raise ModuleNotFoundError("No language module loaded " "and none specified.") else: raise ModuleNotFoundError("No language module loaded.") if lang_code not in _SUPPORTED_LANGUAGES: try: tmp = lang_code __use_tmp = True lang_code = get_primary_lang_code(lang_code) except ValueError: __error = \ UnsupportedLanguageError("\nLanguage '{language}' is not yet " "supported by Lingua Franca. " "Supported language codes " "include the following:\n{supported}" .format( language=lang_code, supported=_SUPPORTED_FULL_LOCALIZATIONS)) if UnsupportedLanguageError in run_own_code_on: raise __error else: warn(DeprecationWarning("The following warning will " "become an exception in a future " "version of Lingua Franca." + str(__error))) lang_code = get_default_lang() full_lang_code = get_full_lang_code() __use_tmp = False if lang_code not in _SUPPORTED_LANGUAGES: _raise_unsupported_language(lang_code) if __use_tmp: full_lang_code = tmp else: full_lang_code = get_full_lang_code(lang_code) # Here comes the ugly business. 
_module_name = func.__module__.split('.')[-1] _module = import_module(".lang." + _module_name + "_" + lang_code, "lingua_franca") # The nonsense above gets you from lingua_franca.parse # to lingua_franca.lang.parse_xx if _module_name not in _localized_functions.keys(): raise ModuleNotFoundError("Module lingua_franca." + _module_name + " not recognized") if lang_code not in _localized_functions[_module_name].keys(): if load_langs_on_demand: load_language(lang_code) unload_language_afterward = True else: raise ModuleNotFoundError(_module_name + " module of language '" + lang_code + "' is not currently loaded.") func_name = func.__name__.split('.')[-1] # At some point in the past, both the module and the language # were imported/loaded, respectively. # When that happened, we cached the *signature* of each # localized function. # # This is the crucial element that allows us to import funcs # on the fly. # # If we didn't find a localized function to correspond with # the wrapped function, we cached NotImplementedError in its # place. loc_signature = _localized_functions[_module_name][lang_code][func_name] if isinstance(loc_signature, type(NotImplementedError())): raise loc_signature # Now we have the appropriate localized module. Let's get # the localized function. try: localized_func = getattr( _module, func_name + "_" + lang_code) except AttributeError: raise FunctionNotLocalizedError(func_name, lang_code) # We now have a localized function, such as # lingua_franca.parse.extract_datetime_en # Get 'lang' out of its parameters. if 'lang' in kwargs: del kwargs['lang'] args = tuple(arg for arg in list(args) if arg not in (lang_code, full_lang_code)) # Now we call the function, ignoring any kwargs from the # wrapped function that aren't in the localized function. 
def populate_localized_function_dict(lf_module, langs=None):
    """Returns a dictionary of dictionaries, containing localized functions.

    Used by the top-level modules to locate, cache, and call localized funcs.

    Arguments:
        lf_module (str) -- the name of the top-level module
        langs (list(str), optional) -- language codes to populate; the
            currently active languages when omitted

    Returns:
        Dict -- {language_code : {function_name (str) : signature}}, where
            the value is a FunctionNotLocalizedError instance for functions
            the language does not implement

    Note:
        The dictionary returned can be used directly,
        but it's normally discarded. Rather, this function will create
        the dictionary as a member of
        `lingua_franca.internal._localized_functions`,
        and its members are invoked via the `@localized_function` decorator.
    """
    if langs is None:
        # Look the active languages up at call time. A default of
        # `get_active_langs()` in the signature is evaluated only once,
        # when the function is defined, and keeps pointing at the old list
        # after _set_active_langs() rebinds the module-level one.
        langs = get_active_langs()

    bad_lang_code = "Language code '{}' is registered with" \
                    " Lingua Franca, but its " + lf_module + " module" \
                    " could not be found."
    return_dict = {}
    for lang_code in langs:
        primary_lang_code = get_primary_lang_code(lang_code)
        return_dict[primary_lang_code] = {}
        _FUNCTION_NOT_FOUND = ""
        try:
            lang_common_data = import_module(".lang.common_data_" +
                                             primary_lang_code,
                                             "lingua_franca")
            _FUNCTION_NOT_FOUND = getattr(lang_common_data,
                                          "_FUNCTION_NOT_IMPLEMENTED_WARNING")
            del lang_common_data
        except Exception:
            # Best effort: any problem getting the localized message
            # falls back to the English one.
            _FUNCTION_NOT_FOUND = "This function has not been implemented" \
                                  " in the specified language."
        _FUNCTION_NOT_FOUND = FunctionNotLocalizedError(_FUNCTION_NOT_FOUND)

        try:
            mod = import_module(".lang." + lf_module +
                                "_" + primary_lang_code, "lingua_franca")
        except ModuleNotFoundError:
            # Leave this language's entry empty and carry on with the rest
            warn(Warning(bad_lang_code.format(primary_lang_code)))
            continue

        function_names = getattr(import_module("." + lf_module,
                                               "lingua_franca"),
                                 "_REGISTERED_FUNCTIONS")
        for function_name in function_names:
            try:
                function = getattr(mod, function_name
                                   + "_" + primary_lang_code)
                function_signature = signature(function)
                del function
            except AttributeError:
                function_signature = _FUNCTION_NOT_FOUND
                # TODO log these occurrences: "function 'function_name' not
                # implemented in language 'primary_lang_code'"
                #
                # Perhaps provide this info to autodocs, to help volunteers
                # identify the functions in need of localization
            return_dict[primary_lang_code][function_name] = function_signature

        del mod
    _localized_functions[lf_module] = return_dict
    return _localized_functions[lf_module]
Example: With mycroft running as the user 'bob', if you called resolve_resource_file('snd/beep.wav') it would return either '/home/bob/.mycroft/snd/beep.wav' or '/opt/mycroft/snd/beep.wav' or '.../mycroft/res/snd/beep.wav', where the '...' is replaced by the path where the package has been installed. Args: res_name(str): a resource path/name Returns: str: path to resource or None if no resource found """ # First look for fully qualified file (e.g. a user setting) if os.path.isfile(res_name): return res_name # Now look for ~/.mycroft/res_name (in user folder) filename = os.path.expanduser("~/.mycroft/" + res_name) if os.path.isfile(filename): return filename # Next look for /opt/mycroft/res/res_name data_dir = data_dir or os.path.expanduser("/opt/mycroft/res/") filename = os.path.expanduser(os.path.join(data_dir, res_name)) if os.path.isfile(filename): return filename # Finally look for it in the source package filename = os.path.join(os.path.dirname(__file__), 'res', res_name) filename = os.path.abspath(os.path.normpath(filename)) if os.path.isfile(filename): return filename return None # Resource cannot be resolved def lookup_variant(mappings, key="variant"): """function decorator maps strings to Enums expected by language specific functions mappings can be used to translate values read from configuration files Example usage: @lookup_variant({ "default": TimeVariant.DEFAULT, "traditional": TimeVariant.TRADITIONAL }) def nice_time_XX(dt, speech=True, use_24hour=False, use_ampm=False, variant=None): variant = variant or TimeVariant.DEFAULT (...) 
""" if not isinstance(mappings, dict): raise ValueError # Begin wrapper def lang_variant_function_decorator(func): @wraps(func) def call_function(*args, **kwargs): if key in kwargs and isinstance(kwargs[key], str): if kwargs[key] in mappings: kwargs[key] = mappings[kwargs[key]] else: raise ValueError("Unknown variant, mapping does not " "exist for {v}".format(v=key)) return func(*args, **kwargs) return call_function try: return lang_variant_function_decorator except NotImplementedError as e: warn(str(e)) return lingua-franca-release-v0.4.3/lingua_franca/lang/000077500000000000000000000000001426211343400215435ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/lang/__init__.py000066400000000000000000000044521426211343400236610ustar00rootroot00000000000000# Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from warnings import warn from lingua_franca.internal import get_default_lang, \ set_default_lang, get_primary_lang_code as gplc, get_full_lang_code as gflc def get_active_lang(): """ Get the active full language code (BCP-47) Returns: str: A BCP-47 language code, e.g. ("en-us", or "pt-pt") """ _getlang = "Direct imports from lingua_franca.lang" " have been deprecated. Use" " lingua_franca.get_default_lang()" warn(_getlang, DeprecationWarning) return get_default_lang() def set_active_lang(lang_code): """ Set the active BCP-47 language code to be used in formatting/parsing Args: lang (str): BCP-47 language code, e.g. 
"en-us" or "es-mx" """ _setlang = "Direct imports from lingua_franca.lang" " have been deprecated. Use" " lingua_franca.set_default_lang()" warn(_setlang, DeprecationWarning) set_default_lang(lang_code=lang_code) def get_primary_lang_code(lang=None): """ Get the primary language code Args: lang (str, optional): A BCP-47 language code, or None for default Returns: str: A primary language family, such as "en", "de" or "pt" """ warn("Direct imports from lingua_franca.lang have been deprecated. Use" " lingua_franca.get_primary_lang_code()", DeprecationWarning) return gplc(lang=lang) def get_full_lang_code(lang=None): """ Get the full language code Args: lang (str, optional): A BCP-47 language code, or None for default Returns: str: A full language code, such as "en-us" or "de-de" """ warn("Direct imports from lingua_franca.lang have been deprecated. Use" " lingua_franca.get_full_lang_code()", DeprecationWarning) return gflc(lang=lang) lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_ca.py000066400000000000000000000100131426211343400250340ustar00rootroot00000000000000_FUNCTION_NOT_IMPLEMENTED_WARNING = "aquesta funció encara no s'ha implementat en 'ca'" # Undefined articles ["un", "una", "uns", "unes"] can not be supressed, # in CA, "un cavall" means "a horse" or "one horse". 
_ARTICLES_CA = ["el", "la", "l", "lo", "els", "les", "los"] # word rules for gender _FEMALE_ENDINGS_CA = ["a", "esa", "essa", "esses", "eses", "ena", "enes", "ques", "asi", "esi", "isi", "osi", "ut", "at", "eta", "etes", "tja", "tges", "ica", "iques", "ada", "ades"] _MALE_ENDINGS_CA = ["o", "os", "ll", "lls", "ig", "igs", "itjos", "rs", "et", "ets", "ès", "ns", "ic", "ics", "at", "ats"] # special cases, word lookup for words not covered by above rule _GENDERS_CA = { "dones": "f", "home": "m", "pell": "f", "pells": "f" } # context rules for gender _MALE_DETERMINANTS_CA = ["el", "els", "l", "lo", "es", "aquest", "aquests", "aquell", "aquells", "aqueix", "aqueixos", "algun", "alguns", "este", "estos", "altre", "mon", "mos", "mons", "meus", "meus"] _FEMALE_DETERMINANTS_CA = ["la", "les", "sa", "ses", "aquesta", "aquestes", "aquella", "aquelles", "aqueixa", "aqueixes", "alguna", "algunes", "esta", "estes", "altra", "ma", "mes", "meva", "meua", "meves"] _NUMBERS_CA = { "zero": 0, "u": 1, "un": 1, "una": 1, "uns": 1, "unes": 1, "primer": 1, "primera": 1, "segon": 2, "segona": 2, "tercer": 3, "tercera": 3, "dos": 2, "dues": 2, "tres": 3, "quatre": 4, "cinc": 5, "sis": 6, "set": 7, "vuit": 8, "huit": 8, "nou": 9, "deu": 10, "onze": 11, "dotze": 12, "tretze": 13, "catorze": 14, "quinze": 15, "setze": 16, "disset": 17, "divuit": 18, "dinou": 19, "vint": 20, "trenta": 30, "quaranta": 40, "cinquanta": 50, "seixanta": 60, "setanta": 70, "vuitanta": 80, "noranta": 90, "cent": 100, "cents": 100, "dos-cents": 200, "dues-centes": 200, "tres-cents": 300, "tres-centes": 300, "quatre-cents": 400, "quatre-centes": 400, "cinc-cents": 500, "cinc-centes": 500, "sis-cents": 600, "sis-centes": 600, "set--cents": 700, "set-centes": 700, "vuit-cents": 800, "vuit-centes": 800, "nou-cents": 900, "nou-centes": 900, "mil": 1000, "milió": 1000000 } _FRACTION_STRING_CA = { 2: 'mig', 3: 'terç', 4: 'quart', 5: 'cinquè', 6: 'sisè', 7: 'setè', 8: 'vuitè', 9: 'novè', 10: 'desè', 11: 'onzè', 12: 'dotzè', 
13: 'tretzè', 14: 'catorzè', 15: 'quinzè', 16: 'setzè', 17: 'dissetè', 18: 'divuitè', 19: 'dinovè', 20: 'vintè', 30: 'trentè', 100: 'centè', 1000: 'milè' } _NUM_STRING_CA = { 0: 'zero', 1: 'un', 2: 'dos', 3: 'tres', 4: 'quatre', 5: 'cinc', 6: 'sis', 7: 'set', 8: 'vuit', 9: 'nou', 10: 'deu', 11: 'onze', 12: 'dotze', 13: 'tretze', 14: 'catorze', 15: 'quinze', 16: 'setze', 17: 'disset', 18: 'divuit', 19: 'dinou', 20: 'vint', 30: 'trenta', 40: 'quaranta', 50: 'cinquanta', 60: 'seixanta', 70: 'setanta', 80: 'vuitanta', 90: 'noranta' } _TENS_CA = { "vint": 20, "trenta": 30, "quaranta": 40, "cinquanta": 50, "seixanta": 60, "setanta": 70, "vuitanta": 80, "huitanta": 80, "noranta": 90 } _AFTER_TENS_CA = { "u": 1, "un": 1, "dos": 2, "dues": 2, "tres": 3, "quatre": 4, "cinc": 5, "sis": 6, "set": 7, "vuit": 8, "huit": 8, "nou": 9 } _BEFORE_HUNDREDS_CA = { "dos": 2, "dues": 2, "tres": 3, "quatre": 4, "cinc": 5, "sis": 6, "set": 7, "vuit": 8, "huit": 8, "nou": 9, } _HUNDREDS_CA = { "cent": 100, "cents": 100, "centes": 100 } lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_cs.py000066400000000000000000000161571426211343400250750ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# from collections import OrderedDict #_ARTICLES_CS = {} _NUM_STRING_CS = { 0: 'nula', 1: 'jedna', 2: 'dva', 3: 'tři', 4: 'čtyři', 5: 'pět', 6: 'šest', 7: 'sedm', 8: 'osm', 9: 'devět', 10: 'deset', 11: 'jedenáct', 12: 'dvanáct', 13: 'třináct', 14: 'čtrnáct', 15: 'patnáct', 16: 'šestnáct', 17: 'sedmnáct', 18: 'osmnáct', 19: 'devatenáct', 20: 'dvacet', 30: 'třicet', 40: 'čtyřicet', 50: 'padesát', 60: 'šedesát', 70: 'sedmdesát', 80: 'osmdesát', 90: 'devadesát' } _FRACTION_STRING_CS = { 2: 'polovina', 3: 'třetina', 4: 'čtvrtina', 5: 'pětina', 6: 'šestina', 7: 'sedmina', 8: 'osmina', 9: 'devítina', 10: 'desetina', 11: 'jedenáctina', 12: 'dvanáctina', 13: 'třináctina', 14: 'čtrnáctina', 15: 'patnáctina', 16: 'šestnáctina', 17: 'sedmnáctina', 18: 'osmnáctina', 19: 'devatenáctina', 20: 'dvacetina', 30: 'třicetina', 40: 'čtyřicetina', 50: 'padesátina', 60: 'šedesátina', 70: 'sedmdesátina', 80: 'osmdesátina', 90: 'devadesátina', 1e2: 'setina', 1e3: 'tisícina' } _LONG_SCALE_CS = OrderedDict([ (100, 'sto'), (1000, 'tisíc'), (1000000, 'milion'), (1e9, "miliarda"), (1e12, "bilion"), (1e15, "biliarda"), (1e18, "trilion"), (1e21, "triliarda"), (1e24, "kvadrilion"), (1e27, "kvadriliarda"), (1e30, "kvintilion"), (1e33, "kvintiliarda"), (1e36, "sextilion"), (1e39, "sextiliarda"), (1e42, "septilion"), (1e45, "septiliarda"), (1e48, "oktilion"), (1e51, "oktiliarda"), (1e54, "nonilion"), (1e57, "noniliarda"), (1e60, "decilion"), (1e63, "deciliarda"), (1e120, "vigintilion"), (1e180, "trigintilion"), (1e303, "kvinkvagintiliarda"), (1e600, "centilion"), (1e603, "centiliarda") ]) _SHORT_SCALE_CS = OrderedDict([ (100, 'sto'), (1000, 'tisíc'), (1000000, 'million'), (1e9, "billion"), (1e12, 'trillion'), (1e15, "quadrillion"), (1e18, "quintillion"), (1e21, "sextillion"), (1e24, "septillion"), (1e27, "octillion"), (1e30, "nonillion"), (1e33, "decillion"), (1e36, "undecillion"), (1e39, "duodecillion"), (1e42, "tredecillion"), (1e45, "quadrdecillion"), (1e48, "quindecillion"), (1e51, 
"sexdecillion"), (1e54, "septendecillion"), (1e57, "octodecillion"), (1e60, "novemdecillion"), (1e63, "vigintillion"), (1e66, "unvigintillion"), (1e69, "uuovigintillion"), (1e72, "tresvigintillion"), (1e75, "quattuorvigintillion"), (1e78, "quinquavigintillion"), (1e81, "qesvigintillion"), (1e84, "septemvigintillion"), (1e87, "octovigintillion"), (1e90, "novemvigintillion"), (1e93, "trigintillion"), (1e96, "untrigintillion"), (1e99, "duotrigintillion"), (1e102, "trestrigintillion"), (1e105, "quattuortrigintillion"), (1e108, "quinquatrigintillion"), (1e111, "sestrigintillion"), (1e114, "septentrigintillion"), (1e117, "octotrigintillion"), (1e120, "noventrigintillion"), (1e123, "quadragintillion"), (1e153, "quinquagintillion"), (1e183, "sexagintillion"), (1e213, "septuagintillion"), (1e243, "octogintillion"), (1e273, "nonagintillion"), (1e303, "centillion"), (1e306, "uncentillion"), (1e309, "duocentillion"), (1e312, "trescentillion"), (1e333, "decicentillion"), (1e336, "undecicentillion"), (1e363, "viginticentillion"), (1e366, "unviginticentillion"), (1e393, "trigintacentillion"), (1e423, "quadragintacentillion"), (1e453, "quinquagintacentillion"), (1e483, "sexagintacentillion"), (1e513, "septuagintacentillion"), (1e543, "ctogintacentillion"), (1e573, "nonagintacentillion"), (1e603, "ducentillion"), (1e903, "trecentillion"), (1e1203, "quadringentillion"), (1e1503, "quingentillion"), (1e1803, "sescentillion"), (1e2103, "septingentillion"), (1e2403, "octingentillion"), (1e2703, "nongentillion"), (1e3003, "millinillion") ]) _ORDINAL_BASE_CS = { 1: 'první', 2: 'druhý', 3: 'třetí', 4: 'čtvrtý', 5: 'pátý', 6: 'šestý', 7: 'sedmý', 8: 'osmý', 9: 'devátý', 10: 'desátý', 11: 'jedenáctý', 12: 'dvanáctý', 13: 'třináctý', 14: 'čtrnáctý', 15: 'patnáctý', 16: 'šestnáctý', 17: 'sedmnáctý', 18: 'osmnáctý', 19: 'devatenáctý', 20: 'dvacátý', 30: 'třicátý', 40: "čtyřicátý", 50: "padesátý", 60: "šedesátý", 70: "sedmdesátý", 80: "osmdesátý", 90: "devadesátý", 1e2: "stý", 1e3: "tisící" } 
_SHORT_ORDINAL_CS = { 1e6: "miliontý", 1e9: "billiontý", 1e12: "trilliontý", 1e15: "quadrilliontý", 1e18: "quintilliontý", 1e21: "sextilliontý", 1e24: "septilliontý", 1e27: "oktiliontý", 1e30: "nonilliontý", 1e33: "decilliontý" # TODO > 1e-33 } _SHORT_ORDINAL_CS.update(_ORDINAL_BASE_CS) _LONG_ORDINAL_CS = { 1e6: "miliontý", 1e9: "miliardtý", 1e12: "biliontý", 1e15: "biliardtý", 1e18: "triliontý", 1e21: "triliardtý", 1e24: "kvadriliontý", 1e27: "kvadriliardtý", 1e30: "kvintiliontý", 1e33: "kvintiliardtý", 1e36: "sextiliontý", 1e39: "sextiliardtý", 1e42: "septiliontý", 1e45: "septiliardtý", 1e48: "oktilion", 1e51: "oktiliardtý", 1e54: "noniliontý", 1e57: "noniliardtý", 1e60: "deciliontý" # TODO > 1e60 } _LONG_ORDINAL_CS.update(_ORDINAL_BASE_CS) # Months _MONTHS_CONVERSION = { 0: "january", 1: "february", 2: "march", 3: "april", 4: "may", 5: "june", 6: "july", 7: "august", 8: "september", 9: "october", 10: "november", 11: "december" } _MONTHS_CZECH = ['leden', 'únor', 'březen', 'duben', 'květen', 'červen', 'červenec', 'srpen', 'září', 'říjen', 'listopad', 'prosinec'] # Time _TIME_UNITS_CONVERSION = { 'mikrosekund': 'microseconds', 'milisekund': 'milliseconds', 'sekundu': 'seconds', 'sekundy': 'seconds', 'sekund': 'seconds', 'minutu': 'minutes', 'minuty': 'minutes', 'minut': 'minutes', 'hodin': 'hours', 'den': 'days', # 1 day 'dny': 'days', # 2-4 days 'dnů': 'days', # 5+ days 'dní': 'days', # 5+ days - different inflection 'dne': 'days', # a half day 'týden': 'weeks', 'týdny': 'weeks', 'týdnů': 'weeks' } lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_da.py000066400000000000000000000051331426211343400250440ustar00rootroot00000000000000_FUNCTION_NOT_IMPLEMENTED_WARNING = "Denne funktion er ikke implementeret i 'dk'." 
_DA_NUMBERS = { 'nul': 0, 'en': 1, 'et': 1, 'to': 2, 'tre': 3, 'fire': 4, 'fem': 5, 'seks': 6, 'syv': 7, 'otte': 8, 'ni': 9, 'ti': 10, 'elve': 11, 'tolv': 12, 'tretten': 13, 'fjorten': 14, 'femten': 15, 'seksten': 16, 'sytten': 17, 'atten': 18, 'nitten': 19, 'tyve': 20, 'enogtyve': 21, 'toogtyve': 22, 'treogtyve': 23, 'fireogtyve': 24, 'femogtyve': 25, 'seksogtyve': 26, 'syvogtyve': 27, 'otteogtyve': 28, 'niogtyve': 29, 'tredive': 30, 'enogtredive': 31, 'fyrrre': 40, 'halvtres': 50, 'tres': 60, 'halvfjers': 70, 'firs': 80, 'halvfems': 90, 'hunderede': 100, 'tohundrede': 200, 'trehundrede': 300, 'firehundrede': 400, 'femhundrede': 500, 'sekshundrede': 600, 'syvhundrede': 700, 'ottehundrede': 800, 'nihundrede': 900, 'tusinde': 1000, 'million': 1000000 } _MONTHS_DA = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', 'juli', 'august', 'september', 'oktober', 'november', 'dezember'] _NUM_STRING_DA = { 0: 'nul', 1: 'en', 2: 'to', 3: 'tre', 4: 'fire', 5: 'fem', 6: 'seks', 7: 'syv', 8: 'otte', 9: 'ni', 10: 'ti', 11: 'elve', 12: 'tolv', 13: 'tretten', 14: 'fjorten', 15: 'femten', 16: 'seksten', 17: 'sytten', 18: 'atten', 19: 'nitten', 20: 'tyve', 30: 'tredive', 40: 'fyrre', 50: 'halvtres', 60: 'tres', 70: 'halvfjers', 80: 'firs', 90: 'halvfems', 100: 'hundrede' } _NUM_POWERS_OF_TEN = [ 'hundred', 'tusind', 'million', 'milliard', 'billion', 'billiard', 'trillion', 'trilliard' ] _FRACTION_STRING_DA = { 2: 'halv', 3: 'trediedel', 4: 'fjerdedel', 5: 'femtedel', 6: 'sjettedel', 7: 'syvendedel', 8: 'ottendedel', 9: 'niendedel', 10: 'tiendedel', 11: 'elftedel', 12: 'tolvtedel', 13: 'trettendedel', 14: 'fjortendedel', 15: 'femtendedel', 16: 'sejstendedel', 17: 'syttendedel', 18: 'attendedel', 19: 'nittendedel', 20: 'tyvendedel' } # Numbers below 1 million are written in one word in Danish, yielding very # long words # In some circumstances it may better to seperate individual words # Set _EXTRA_SPACE_DA=" " for separating numbers below 1 million ( # orthographically incorrect) 
# Set _EXTRA_SPACE_DA="" for correct spelling, this is standard # _EXTRA_SPACE_DA = " " _EXTRA_SPACE_DA = "" lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_de.py000066400000000000000000000056601426211343400250550ustar00rootroot00000000000000_DE_NUMBERS = { 'null': 0, 'ein': 1, 'eins': 1, 'eine': 1, 'einer': 1, 'einem': 1, 'einen': 1, 'eines': 1, 'zwei': 2, 'drei': 3, 'vier': 4, 'fünf': 5, 'sechs': 6, 'sieben': 7, 'acht': 8, 'neun': 9, 'zehn': 10, 'elf': 11, 'zwölf': 12, 'dreizehn': 13, 'vierzehn': 14, 'fünfzehn': 15, 'sechzehn': 16, 'siebzehn': 17, 'achtzehn': 18, 'neunzehn': 19, 'zwanzig': 20, 'einundzwanzig': 21, 'zweiundzwanzig': 22, 'dreiundzwanzig': 23, 'vierundzwanzig': 24, 'fünfundzwanzig': 25, 'sechsundzwanzig': 26, 'siebenundzwanzig': 27, 'achtundzwanzig': 28, 'neunundzwanzig': 29, 'dreißig': 30, 'einunddreißig': 31, 'vierzig': 40, 'fünfzig': 50, 'sechzig': 60, 'siebzig': 70, 'achtzig': 80, 'neunzig': 90, 'hundert': 100, 'zweihundert': 200, 'dreihundert': 300, 'vierhundert': 400, 'fünfhundert': 500, 'sechshundert': 600, 'siebenhundert': 700, 'achthundert': 800, 'neunhundert': 900, 'tausend': 1000, 'million': 1000000 } _MONTHS_DE = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', 'juli', 'august', 'september', 'oktober', 'november', 'dezember'] _NUM_STRING_DE = { 0: 'null', 1: 'ein', # ein Viertel etc., nicht eins Viertel 2: 'zwei', 3: 'drei', 4: 'vier', 5: 'fünf', 6: 'sechs', 7: 'sieben', 8: 'acht', 9: 'neun', 10: 'zehn', 11: 'elf', 12: 'zwölf', 13: 'dreizehn', 14: 'vierzehn', 15: 'fünfzehn', 16: 'sechzehn', 17: 'siebzehn', 18: 'achtzehn', 19: 'neunzehn', 20: 'zwanzig', 30: 'dreißig', 40: 'vierzig', 50: 'fünfzig', 60: 'sechzig', 70: 'siebzig', 80: 'achtzig', 90: 'neunzig', 100: 'hundert' } # German uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales # Currently, numbers are limited to 1000000000000000000000000, # but _NUM_POWERS_OF_TEN can be extended to include additional number words _NUM_POWERS_OF_TEN_DE = [ '', 
'tausend', 'Million', 'Milliarde', 'Billion', 'Billiarde', 'Trillion', 'Trilliarde' ] _FRACTION_STRING_DE = { 2: 'halb', 3: 'drittel', 4: 'viertel', 5: 'fünftel', 6: 'sechstel', 7: 'siebtel', 8: 'achtel', 9: 'neuntel', 10: 'zehntel', 11: 'elftel', 12: 'zwölftel', 13: 'dreizehntel', 14: 'vierzehntel', 15: 'fünfzehntel', 16: 'sechzehntel', 17: 'siebzehntel', 18: 'achtzehntel', 19: 'neunzehntel', 20: 'zwanzigstel' } # Numbers below 1 million are written in one word in German, yielding very # long words # In some circumstances it may better to seperate individual words # Set _EXTRA_SPACE_DA=" " for separating numbers below 1 million ( # orthographically incorrect) # Set _EXTRA_SPACE_DA="" for correct spelling, this is standard # _EXTRA_SPACE_DA = " " _EXTRA_SPACE_DE = "" lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_en.py000066400000000000000000000165331426211343400250700ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from collections import OrderedDict from .parse_common import invert_dict _FUNCTION_NOT_IMPLEMENTED_WARNING = "The requested function is not implemented in English." 
_ARTICLES_EN = {'a', 'an', 'the'} _NUM_STRING_EN = { 0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten', 11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen', 15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen', 19: 'nineteen', 20: 'twenty', 30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty', 70: 'seventy', 80: 'eighty', 90: 'ninety' } _FRACTION_STRING_EN = { 2: 'half', 3: 'third', 4: 'forth', 5: 'fifth', 6: 'sixth', 7: 'seventh', 8: 'eigth', 9: 'ninth', 10: 'tenth', 11: 'eleventh', 12: 'twelveth', 13: 'thirteenth', 14: 'fourteenth', 15: 'fifteenth', 16: 'sixteenth', 17: 'seventeenth', 18: 'eighteenth', 19: 'nineteenth', 20: 'twentyith' } _LONG_SCALE_EN = OrderedDict([ (100, 'hundred'), (1000, 'thousand'), (1000000, 'million'), (1e12, "billion"), (1e18, 'trillion'), (1e24, "quadrillion"), (1e30, "quintillion"), (1e36, "sextillion"), (1e42, "septillion"), (1e48, "octillion"), (1e54, "nonillion"), (1e60, "decillion"), (1e66, "undecillion"), (1e72, "duodecillion"), (1e78, "tredecillion"), (1e84, "quattuordecillion"), (1e90, "quinquadecillion"), (1e96, "sedecillion"), (1e102, "septendecillion"), (1e108, "octodecillion"), (1e114, "novendecillion"), (1e120, "vigintillion"), (1e306, "unquinquagintillion"), (1e312, "duoquinquagintillion"), (1e336, "sesquinquagintillion"), (1e366, "unsexagintillion") ]) _SHORT_SCALE_EN = OrderedDict([ (100, 'hundred'), (1000, 'thousand'), (1000000, 'million'), (1e9, "billion"), (1e12, 'trillion'), (1e15, "quadrillion"), (1e18, "quintillion"), (1e21, "sextillion"), (1e24, "septillion"), (1e27, "octillion"), (1e30, "nonillion"), (1e33, "decillion"), (1e36, "undecillion"), (1e39, "duodecillion"), (1e42, "tredecillion"), (1e45, "quattuordecillion"), (1e48, "quinquadecillion"), (1e51, "sedecillion"), (1e54, "septendecillion"), (1e57, "octodecillion"), (1e60, "novendecillion"), (1e63, "vigintillion"), (1e66, "unvigintillion"), (1e69, "uuovigintillion"), (1e72, 
"tresvigintillion"), (1e75, "quattuorvigintillion"), (1e78, "quinquavigintillion"), (1e81, "qesvigintillion"), (1e84, "septemvigintillion"), (1e87, "octovigintillion"), (1e90, "novemvigintillion"), (1e93, "trigintillion"), (1e96, "untrigintillion"), (1e99, "duotrigintillion"), (1e102, "trestrigintillion"), (1e105, "quattuortrigintillion"), (1e108, "quinquatrigintillion"), (1e111, "sestrigintillion"), (1e114, "septentrigintillion"), (1e117, "octotrigintillion"), (1e120, "noventrigintillion"), (1e123, "quadragintillion"), (1e153, "quinquagintillion"), (1e183, "sexagintillion"), (1e213, "septuagintillion"), (1e243, "octogintillion"), (1e273, "nonagintillion"), (1e303, "centillion"), (1e306, "uncentillion"), (1e309, "duocentillion"), (1e312, "trescentillion"), (1e333, "decicentillion"), (1e336, "undecicentillion"), (1e363, "viginticentillion"), (1e366, "unviginticentillion"), (1e393, "trigintacentillion"), (1e423, "quadragintacentillion"), (1e453, "quinquagintacentillion"), (1e483, "sexagintacentillion"), (1e513, "septuagintacentillion"), (1e543, "ctogintacentillion"), (1e573, "nonagintacentillion"), (1e603, "ducentillion"), (1e903, "trecentillion"), (1e1203, "quadringentillion"), (1e1503, "quingentillion"), (1e1803, "sescentillion"), (1e2103, "septingentillion"), (1e2403, "octingentillion"), (1e2703, "nongentillion"), (1e3003, "millinillion") ]) _ORDINAL_BASE_EN = { 1: 'first', 2: 'second', 3: 'third', 4: 'fourth', 5: 'fifth', 6: 'sixth', 7: 'seventh', 8: 'eighth', 9: 'ninth', 10: 'tenth', 11: 'eleventh', 12: 'twelfth', 13: 'thirteenth', 14: 'fourteenth', 15: 'fifteenth', 16: 'sixteenth', 17: 'seventeenth', 18: 'eighteenth', 19: 'nineteenth', 20: 'twentieth', 30: 'thirtieth', 40: "fortieth", 50: "fiftieth", 60: "sixtieth", 70: "seventieth", 80: "eightieth", 90: "ninetieth", 1e2: "hundredth", 1e3: "thousandth" } _SHORT_ORDINAL_EN = { 1e6: "millionth", 1e9: "billionth", 1e12: "trillionth", 1e15: "quadrillionth", 1e18: "quintillionth", 1e21: "sextillionth", 1e24: 
"septillionth", 1e27: "octillionth", 1e30: "nonillionth", 1e33: "decillionth" # TODO > 1e-33 } _SHORT_ORDINAL_EN.update(_ORDINAL_BASE_EN) _LONG_ORDINAL_EN = { 1e6: "millionth", 1e12: "billionth", 1e18: "trillionth", 1e24: "quadrillionth", 1e30: "quintillionth", 1e36: "sextillionth", 1e42: "septillionth", 1e48: "octillionth", 1e54: "nonillionth", 1e60: "decillionth" # TODO > 1e60 } _LONG_ORDINAL_EN.update(_ORDINAL_BASE_EN) # negate next number (-2 = 0 - 2) _NEGATIVES_EN = {"negative", "minus"} # sum the next number (twenty two = 20 + 2) _SUMS_EN = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50', 'sixty', '60', 'seventy', '70', 'eighty', '80', 'ninety', '90'} def _generate_plurals_en(originals): """ Return a new set or dict containing the plural form of the original values, In English this means all with 's' appended to them. Args: originals set(str) or dict(str, any): values to pluralize Returns: set(str) or dict(str, any) """ # TODO migrate to https://github.com/MycroftAI/lingua-franca/pull/36 if isinstance(originals, dict): return {key + 's': value for key, value in originals.items()} return {value + "s" for value in originals} _MULTIPLIES_LONG_SCALE_EN = set(_LONG_SCALE_EN.values()) | \ _generate_plurals_en(_LONG_SCALE_EN.values()) _MULTIPLIES_SHORT_SCALE_EN = set(_SHORT_SCALE_EN.values()) | \ _generate_plurals_en(_SHORT_SCALE_EN.values()) # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) _FRACTION_MARKER_EN = {"and"} # decimal marker ( 1 point 5 = 1 + 0.5) _DECIMAL_MARKER_EN = {"point", "dot"} _STRING_NUM_EN = invert_dict(_NUM_STRING_EN) _STRING_NUM_EN.update(_generate_plurals_en(_STRING_NUM_EN)) _SPOKEN_EXTRA_NUM_EN = { "half": 0.5, "halves": 0.5, "couple": 2 } _STRING_SHORT_ORDINAL_EN = invert_dict(_SHORT_ORDINAL_EN) _STRING_LONG_ORDINAL_EN = invert_dict(_LONG_ORDINAL_EN) lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_es.py000066400000000000000000000164671426211343400251030ustar00rootroot00000000000000# # Copyright 
2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # NOTE: This file as no use yet. It needs to be called from other functions from collections import OrderedDict _ARTICLES_ES = {'el', 'la', 'los', 'las'} _NUM_STRING_ES = { 0: 'cero', 1: 'uno', 2: 'dos', 3: 'tres', 4: 'cuatro', 5: 'cinco', 6: 'seis', 7: 'siete', 8: 'ocho', 9: 'nueve', 10: 'diez', 11: 'once', 12: 'doce', 13: 'trece', 14: 'catorce', 15: 'quince', 16: 'dieciséis', 17: 'diecisete', 18: 'dieciocho', 19: 'diecinueve', 20: 'veinte', 30: 'treinta', 40: 'cuarenta', 50: 'cincuenta', 60: 'sesenta', 70: 'setenta', 80: 'ochenta', 90: 'noventa' } _STRING_NUM_ES = { "cero": 0, "un": 1, "uno": 1, "una": 1, "dos": 2, "tres": 3, "trés": 3, "cuatro": 4, "cinco": 5, "seis": 6, "siete": 7, "ocho": 8, "nueve": 9, "diez": 10, "once": 11, "doce": 12, "trece": 13, "catorce": 14, "quince": 15, "dieciseis": 16, "dieciséis": 16, "diecisiete": 17, "dieciocho": 18, "diecinueve": 19, "veinte": 20, "veintiuno": 21, "veintidos": 22, "veintitres": 23, "veintidós": 22, "veintitrés": 23, "veinticuatro": 24, "veinticinco": 25, "veintiséis": 26, "veintiseis": 26, "veintisiete": 27, "veintiocho": 28, "veintinueve": 29, "treinta": 30, "cuarenta": 40, "cincuenta": 50, "sesenta": 60, "setenta": 70, "ochenta": 80, "noventa": 90, "cien": 100, "ciento": 100, "doscientos": 200, "doscientas": 200, "trescientos": 300, "trescientas": 300, "cuatrocientos": 400, "cuatrocientas": 400, "quinientos": 500, "quinientas": 500, "seiscientos": 600, 
"seiscientas": 600, "setecientos": 700, "setecientas": 700, "ochocientos": 800, "ochocientas": 800, "novecientos": 900, "novecientas": 900, "mil": 1000} _FRACTION_STRING_ES = { 2: 'medio', 3: 'tercio', 4: 'cuarto', 5: 'quinto', 6: 'sexto', 7: 'séptimo', 8: 'octavo', 9: 'noveno', 10: 'décimo', 11: 'onceavo', 12: 'doceavo', 13: 'treceavo', 14: 'catorceavo', 15: 'quinceavo', 16: 'dieciseisavo', 17: 'diecisieteavo', 18: 'dieciochoavo', 19: 'diecinueveavo', 20: 'veinteavo' } # https://www.grobauer.at/es_eur/zahlnamen.php _LONG_SCALE_ES = OrderedDict([ (100, 'centena'), (1000, 'millar'), (1000000, 'millón'), (1e9, "millardo"), (1e12, "billón"), (1e18, 'trillón'), (1e24, "cuatrillón"), (1e30, "quintillón"), (1e36, "sextillón"), (1e42, "septillón"), (1e48, "octillón"), (1e54, "nonillón"), (1e60, "decillón"), (1e66, "undecillón"), (1e72, "duodecillón"), (1e78, "tredecillón"), (1e84, "cuatrodecillón"), (1e90, "quindecillón"), (1e96, "sexdecillón"), (1e102, "septendecillón"), (1e108, "octodecillón"), (1e114, "novendecillón"), (1e120, "vigintillón"), (1e306, "unquinquagintillón"), (1e312, "duoquinquagintillón"), (1e336, "sexquinquagintillón"), (1e366, "unsexagintillón") ]) _SHORT_SCALE_ES = OrderedDict([ (100, 'centena'), (1000, 'millar'), (1000000, 'millón'), (1e9, "billón"), (1e12, 'trillón'), (1e15, "cuatrillón"), (1e18, "quintillón"), (1e21, "sextillón"), (1e24, "septillón"), (1e27, "octillón"), (1e30, "nonillón"), (1e33, "decillón"), (1e36, "undecillón"), (1e39, "duodecillón"), (1e42, "tredecillón"), (1e45, "cuatrodecillón"), (1e48, "quindecillón"), (1e51, "sexdecillón"), (1e54, "septendecillón"), (1e57, "octodecillón"), (1e60, "novendecillón"), (1e63, "vigintillón"), (1e66, "unvigintillón"), (1e69, "uuovigintillón"), (1e72, "tresvigintillón"), (1e75, "quattuorvigintillón"), (1e78, "quinquavigintillón"), (1e81, "qesvigintillón"), (1e84, "septemvigintillón"), (1e87, "octovigintillón"), (1e90, "novemvigintillón"), (1e93, "trigintillón"), (1e96, "untrigintillón"), (1e99, 
"duotrigintillón"), (1e102, "trestrigintillón"), (1e105, "quattuortrigintillón"), (1e108, "quinquatrigintillón"), (1e111, "sestrigintillón"), (1e114, "septentrigintillón"), (1e117, "octotrigintillón"), (1e120, "noventrigintillón"), (1e123, "quadragintillón"), (1e153, "quinquagintillón"), (1e183, "sexagintillón"), (1e213, "septuagintillón"), (1e243, "octogintillón"), (1e273, "nonagintillón"), (1e303, "centillón"), (1e306, "uncentillón"), (1e309, "duocentillón"), (1e312, "trescentillón"), (1e333, "decicentillón"), (1e336, "undecicentillón"), (1e363, "viginticentillón"), (1e366, "unviginticentillón"), (1e393, "trigintacentillón"), (1e423, "quadragintacentillón"), (1e453, "quinquagintacentillón"), (1e483, "sexagintacentillón"), (1e513, "septuagintacentillón"), (1e543, "ctogintacentillón"), (1e573, "nonagintacentillón"), (1e603, "ducentillón"), (1e903, "trecentillón"), (1e1203, "quadringentillón"), (1e1503, "quingentillón"), (1e1803, "sexcentillón"), (1e2103, "septingentillón"), (1e2403, "octingentillón"), (1e2703, "nongentillón"), (1e3003, "millinillón") ]) # TODO: female forms. 
_ORDINAL_STRING_BASE_ES = { 1: 'primero', 2: 'segundo', 3: 'tercero', 4: 'cuarto', 5: 'quinto', 6: 'sexto', 7: 'séptimo', 8: 'octavo', 9: 'noveno', 10: 'décimo', 11: 'undécimo', 12: 'duodécimo', 13: 'decimotercero', 14: 'decimocuarto', 15: 'decimoquinto', 16: 'decimosexto', 17: 'decimoséptimo', 18: 'decimoctavo', 19: 'decimonoveno', 20: 'vigésimo', 30: 'trigésimo', 40: "cuadragésimo", 50: "quincuagésimo", 60: "sexagésimo", 70: "septuagésimo", 80: "octogésimo", 90: "nonagésimo", 10e3: "centésimó", 1e3: "milésimo" } _SHORT_ORDINAL_STRING_ES = { 1e6: "millonésimo", 1e9: "milmillonésimo", 1e12: "billonésimo", 1e15: "milbillonésimo", 1e18: "trillonésimo", 1e21: "miltrillonésimo", 1e24: "cuatrillonésimo", 1e27: "milcuatrillonésimo", 1e30: "quintillonésimo", 1e33: "milquintillonésimo" # TODO > 1e-33 } _SHORT_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES) _LONG_ORDINAL_STRING_ES = { 1e6: "millonésimo", 1e12: "billionth", 1e18: "trillonésimo", 1e24: "cuatrillonésimo", 1e30: "quintillonésimo", 1e36: "sextillonésimo", 1e42: "septillonésimo", 1e48: "octillonésimo", 1e54: "nonillonésimo", 1e60: "decillonésimo" # TODO > 1e60 } _LONG_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES) lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_eu.py000066400000000000000000000145511426211343400250750ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # NOTE: This file as no use yet. 
# It needs to be called from other functions

from collections import OrderedDict

# _ARTICLES_ES = {'el', 'la', 'los', 'las'}

# Basque number words mapped to their numeric value (word -> number).
_NUM_STRING_EU = {
    "zero": 0,
    "bat": 1,
    "bi": 2,
    "hiru": 3,
    "lau": 4,
    "bost": 5,
    "sei": 6,
    "zazpi": 7,
    "zortzi": 8,
    "bederatzi": 9,
    "hamar": 10,
    "hamaika": 11,
    "hamabi": 12,
    "hamahiru": 13,
    "hamalau": 14,
    "hamabost": 15,
    "hamasei": 16,
    "hamazazpi": 17,
    "hemezortzi": 18,
    "hemeretzi": 19,
    "hogei": 20,
    "hogeita hamar": 30,
    "hogeita hamaika": 31,
    "berrogei": 40,
    "berrogeita hamar": 50,
    "hirurogei": 60,
    "hirurogeita hamar": 70,
    "laurogei": 80,
    "laurogeita hamar": 90,
    "ehun": 100,
    "berrehun": 200,
    "hirurehun": 300,
    "laurehun": 400,
    "bostehun": 500,
    "seirehun": 600,
    "zazpirehun": 700,
    "zortzirehun": 800,
    "bederatzirehun": 900,
    "mila": 1000}

# Denominator -> fraction word. Only 2-8 are Basque; the entries from 9
# upward are still the Spanish words this table was copied from.
_FRACTION_STRING_EU = {
    2: 'erdia',
    3: 'herena',
    4: 'laurdena',
    5: 'bostena',
    6: 'seiena',
    7: 'zazpiena',
    8: 'zortziena',
    9: 'noveno',
    10: 'décimo',
    11: 'onceavo',
    12: 'doceavo',
    13: 'treceavo',
    14: 'catorceavo',
    15: 'quinceavo',
    16: 'dieciseisavo',
    17: 'diecisieteavo',
    18: 'dieciochoavo',
    19: 'diecinueveavo',
    20: 'veinteavo'
}

# https://www.grobauer.at/es_eur/zahlnamen.php
# NOTE: float literals above roughly 1.8e308 evaluate to float('inf'), so
# 1e312, 1e336 and 1e366 below share the single key inf and only the last
# name survives. Left unchanged here; TODO consider exact ints (10 ** n).
_LONG_SCALE_EU = OrderedDict([
    (100, 'ehuneko'),
    (1000, 'milaren'),
    (1000000, 'millón'),
    (1e9, "millardo"),
    (1e12, "billón"),
    (1e18, 'trillón'),
    (1e24, "cuatrillón"),
    (1e30, "quintillón"),
    (1e36, "sextillón"),
    (1e42, "septillón"),
    (1e48, "octillón"),
    (1e54, "nonillón"),
    (1e60, "decillón"),
    (1e66, "undecillón"),
    (1e72, "duodecillón"),
    (1e78, "tredecillón"),
    (1e84, "cuatrodecillón"),
    (1e90, "quindecillón"),
    (1e96, "sexdecillón"),
    (1e102, "septendecillón"),
    (1e108, "octodecillón"),
    (1e114, "novendecillón"),
    (1e120, "vigintillón"),
    (1e306, "unquinquagintillón"),
    (1e312, "duoquinquagintillón"),
    (1e336, "sexquinquagintillón"),
    (1e366, "unsexagintillón")
])

# NOTE: as above, every entry from 1e309 upward collapses onto the key inf.
_SHORT_SCALE_EU = OrderedDict([
    (100, 'ehuneko'),
    (1000, 'milaren'),
    (1000000, 'millón'),
    (1e9, "billón"),
    (1e12, 'trillón'),
    (1e15, "cuatrillón"),
    (1e18, "quintillón"),
    (1e21, "sextillón"),
    (1e24, "septillón"),
    (1e27, "octillón"),
    (1e30, "nonillón"),
    (1e33, "decillón"),
    (1e36, "undecillón"),
    (1e39, "duodecillón"),
    (1e42, "tredecillón"),
    (1e45, "cuatrodecillón"),
    (1e48, "quindecillón"),
    (1e51, "sexdecillón"),
    (1e54, "septendecillón"),
    (1e57, "octodecillón"),
    (1e60, "novendecillón"),
    (1e63, "vigintillón"),
    (1e66, "unvigintillón"),
    (1e69, "duovigintillón"),  # was misspelled "uuovigintillón"
    (1e72, "tresvigintillón"),
    (1e75, "quattuorvigintillón"),
    (1e78, "quinquavigintillón"),
    (1e81, "sesvigintillón"),  # was misspelled "qesvigintillón"
    (1e84, "septemvigintillón"),
    (1e87, "octovigintillón"),
    (1e90, "novemvigintillón"),
    (1e93, "trigintillón"),
    (1e96, "untrigintillón"),
    (1e99, "duotrigintillón"),
    (1e102, "trestrigintillón"),
    (1e105, "quattuortrigintillón"),
    (1e108, "quinquatrigintillón"),
    (1e111, "sestrigintillón"),
    (1e114, "septentrigintillón"),
    (1e117, "octotrigintillón"),
    (1e120, "noventrigintillón"),
    (1e123, "quadragintillón"),
    (1e153, "quinquagintillón"),
    (1e183, "sexagintillón"),
    (1e213, "septuagintillón"),
    (1e243, "octogintillón"),
    (1e273, "nonagintillón"),
    (1e303, "centillón"),
    (1e306, "uncentillón"),
    (1e309, "duocentillón"),
    (1e312, "trescentillón"),
    (1e333, "decicentillón"),
    (1e336, "undecicentillón"),
    (1e363, "viginticentillón"),
    (1e366, "unviginticentillón"),
    (1e393, "trigintacentillón"),
    (1e423, "quadragintacentillón"),
    (1e453, "quinquagintacentillón"),
    (1e483, "sexagintacentillón"),
    (1e513, "septuagintacentillón"),
    (1e543, "octogintacentillón"),  # was misspelled "ctogintacentillón"
    (1e573, "nonagintacentillón"),
    (1e603, "ducentillón"),
    (1e903, "trecentillón"),
    (1e1203, "quadringentillón"),
    (1e1503, "quingentillón"),
    (1e1803, "sexcentillón"),
    (1e2103, "septingentillón"),
    (1e2403, "octingentillón"),
    (1e2703, "nongentillón"),
    (1e3003, "millinillón")
])

# TODO: female forms.
# Ordinal words; only 1-6 are Basque, the rest are Spanish placeholders.
_ORDINAL_STRING_BASE_EU = {
    1: 'lehenengo',
    2: 'bigarren',
    3: 'hirugarren',
    4: 'laugarren',
    5: 'bostgarren',
    6: 'seigarren',
    7: 'séptimo',
    8: 'octavo',
    9: 'noveno',
    10: 'décimo',
    11: 'undécimo',
    12: 'duodécimo',
    13: 'decimotercero',
    14: 'decimocuarto',
    15: 'decimoquinto',
    16: 'decimosexto',
    17: 'decimoséptimo',
    18: 'decimoctavo',
    19: 'decimonoveno',
    20: 'vigésimo',
    30: 'trigésimo',
    40: "cuadragésimo",
    50: "quincuagésimo",
    60: "sexagésimo",
    70: "septuagésimo",
    80: "octogésimo",
    90: "nonagésimo",
    # Was keyed as 10e3 (== 10000) and spelled "centésimó".
    1e2: "centésimo",
    1e3: "milésimo"
}

_SHORT_ORDINAL_STRING_EU = {
    1e6: "millonésimo",
    1e9: "milmillonésimo",
    1e12: "billonésimo",
    1e15: "milbillonésimo",
    1e18: "trillonésimo",
    1e21: "miltrillonésimo",
    1e24: "cuatrillonésimo",
    1e27: "milcuatrillonésimo",
    1e30: "quintillonésimo",
    1e33: "milquintillonésimo"
    # TODO > 1e33
}
_SHORT_ORDINAL_STRING_EU.update(_ORDINAL_STRING_BASE_EU)

_LONG_ORDINAL_STRING_EU = {
    1e6: "millonésimo",
    1e12: "billonésimo",  # was the untranslated English "billionth"
    1e18: "trillonésimo",
    1e24: "cuatrillonésimo",
    1e30: "quintillonésimo",
    1e36: "sextillonésimo",
    1e42: "septillonésimo",
    1e48: "octillonésimo",
    1e54: "nonillonésimo",
    1e60: "decillonésimo"
    # TODO > 1e60
}
_LONG_ORDINAL_STRING_EU.update(_ORDINAL_STRING_BASE_EU)

# [archive member header] lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_fa.py000066400000000000000000000044171426211343400250520ustar00rootroot00000000000000
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from collections import OrderedDict

from .parse_common import invert_dict

# Farsi warning text; it says the requested function is not implemented
# for the Farsi language.
_FUNCTION_NOT_IMPLEMENTED_WARNING = "تابع خواسته شده در زبان فارسی پیاده سازی نشده است."

# Denominators 2..20 paired, in order, with their Farsi fraction words.
_FRACTION_STRING_FA = dict(zip(range(2, 21), (
    'دوم',
    'سوم',
    'چهارم',
    'پنجم',
    'ششم',
    'هفتم',
    'هشتم',
    'نهم',
    'دهم',
    'یازدهم',
    'دوازدهم',
    'سیزدهم',
    'چهاردهم',
    'پونزدهم',
    'شونزدهم',
    'هیفدهم',
    'هیجدهم',
    'نوزدهم',
    'بیستم',
)))

# Words for 0..19, indexed by value (index 0 is the empty string).
_FARSI_ONES = [""] + [
    "یک",
    "دو",
    "سه",
    "چهار",
    "پنج",
    "شش",
    "هفت",
    "هشت",
    "نه",
    "ده",
    "یازده",
    "دوازده",
    "سیزده",
    "چهارده",
    "پونزده",
    "شونزده",
    "هیفده",
    "هیجده",
    "نوزده",
]

# Words for the tens, indexed by the tens digit.
_FARSI_TENS = [""] + [
    "ده",
    "بیست",
    "سی",
    "چهل",
    "پنجاه",
    "شصت",
    "هفتاد",
    "هشتاد",
    "نود",
]

# Words for the hundreds, indexed by the hundreds digit.
_FARSI_HUNDREDS = [""] + [
    "صد",
    "دویست",
    "سیصد",
    "چهارصد",
    "پانصد",
    "ششصد",
    "هفتصد",
    "هشتصد",
    "نهصد",
]

# Names of the successive powers of a thousand.
_FARSI_BIG = [""] + [
    "هزار",
    "میلیون",
    "میلیارد",
    "تریلیون",
    "تریلیارد",
]

# Alternative spelling -> the spelling used in _FARSI_ONES above.
_FORMAL_VARIANT = dict([
    ("هفده", "هیفده"),
    ("هجده", "هیجده"),
    ("شانزده", "شونزده"),
    ("پانزده", "پونزده"),
])

_FARSI_FRAC = [""] + ["ده", "صد"]
_FARSI_FRAC_BIG = [""] + ["هزار", "میلیونی", "میلیاردی"]
# Joining word, padded with one space on each side.
_FARSI_SEPERATOR = " و "

# [archive member header] lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_fr.py000066400000000000000000000036101426211343400250650ustar00rootroot00000000000000
# Undefined articles ["un", "une"] cannot be suppressed,
# in French, "un cheval" means "a horse" or "one horse".
_ARTICLES_FR = ["le", "la", "du", "de", "les", "des"]

# French number words (including regional variants) -> numeric value.
_NUMBERS_FR = {
    "zéro": 0,
    "un": 1,
    "une": 1,
    "deux": 2,
    "trois": 3,
    "quatre": 4,
    "cinq": 5,
    "six": 6,
    "sept": 7,
    "huit": 8,
    "neuf": 9,
    "dix": 10,
    "onze": 11,
    "douze": 12,
    "treize": 13,
    "quatorze": 14,
    "quinze": 15,
    "seize": 16,
    "vingt": 20,
    "trente": 30,
    "quarante": 40,
    "cinquante": 50,
    "soixante": 60,
    "soixante-dix": 70,
    "septante": 70,
    "quatre-vingt": 80,
    "quatre-vingts": 80,
    "octante": 80,
    "huitante": 80,
    "quatre-vingt-dix": 90,
    "nonante": 90,
    "cent": 100,
    "cents": 100,
    "mille": 1000,
    "mil": 1000,
    "millier": 1000,
    "milliers": 1000,
    "million": 1000000,
    "millions": 1000000,
    "milliard": 1000000000,
    "milliards": 1000000000}

# A comma was missing between "nde" and "ième", so Python's implicit
# string-literal concatenation produced the single bogus ending "ndeième".
_ORDINAL_ENDINGS_FR = ("er", "re", "ère", "nd", "nde", "ième", "ème", "e")

_NUM_STRING_FR = {
    0: 'zéro',
    1: 'un',
    2: 'deux',
    3: 'trois',
    4: 'quatre',
    5: 'cinq',
    6: 'six',
    7: 'sept',
    8: 'huit',
    9: 'neuf',
    10: 'dix',
    11: 'onze',
    12: 'douze',
    13: 'treize',
    14: 'quatorze',
    15: 'quinze',
    16: 'seize',
    20: 'vingt',
    30: 'trente',
    40: 'quarante',
    50: 'cinquante',
    60: 'soixante',
    70: 'soixante-dix',
    80: 'quatre-vingt',
    90: 'quatre-vingt-dix'
}

_FRACTION_STRING_FR = {
    2: 'demi',
    3: 'tiers',
    4: 'quart',
    5: 'cinquième',
    6: 'sixième',
    7: 'septième',
    8: 'huitième',
    9: 'neuvième',
    10: 'dixième',
    11: 'onzième',
    12: 'douzième',
    13: 'treizième',
    14: 'quatorzième',
    15: 'quinzième',
    16: 'seizième',
    17: 'dix-septième',
    18: 'dix-huitième',
    19: 'dix-neuvième',
    20: 'vingtième'
}

# [archive member header] lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_hu.py000066400000000000000000000035301426211343400250730ustar00rootroot00000000000000
_MONTHS_HU = ['január', 'február', 'március', 'április', 'május', 'június',
              'július', 'augusztus', 'szeptember', 'október', 'november',
              'december']

_NUM_STRING_HU = {
    0: 'nulla',
    1: 'egy',
    2: 'kettő',
    3: 'három',
    4: 'négy',
    5: 'öt',
    6: 'hat',
    7: 'hét',
    8: 'nyolc',
    9: 'kilenc',
    10: 'tíz',
    11: 'tizenegy',
    12: 'tizenkettő',
    13: 'tizenhárom',
    14: 'tizennégy',
    15: 'tizenöt',
    16: 'tizenhat',
    17: 'tizenhét',
    18: 'tizennyolc',
    19: 'tizenkilenc',
    20: 'húsz',
    30: 'harminc',
    40: 'negyven',
    50: 'ötven',
    60: 'hatvan',
    70: 'hetven',
    80: 'nyolcvan',
    90: 'kilencven',
    100: 'száz'
}

# Hungarian uses "long scale"
# https://en.wikipedia.org/wiki/Long_and_short_scales
# Currently, numbers are limited to 1000000000000000000000000,
# but _NUM_POWERS_OF_TEN can be extended to include additional number words
_NUM_POWERS_OF_TEN = [
    '', 'ezer', 'millió', 'milliárd', 'billió', 'billiárd', 'trillió',
    'trilliárd'
]

_FRACTION_STRING_HU = {
    2: 'fél',
    3: 'harmad',
    4: 'negyed',
    5: 'ötöd',
    6: 'hatod',
    7: 'heted',
    8: 'nyolcad',
    9: 'kilenced',
    10: 'tized',
    11: 'tizenegyed',
    12: 'tizenketted',
    13: 'tizenharmad',
    14: 'tizennegyed',
    15: 'tizenötöd',
    16: 'tizenhatod',
    17: 'tizenheted',
    18: 'tizennyolcad',
    19: 'tizenkilenced',
    20: 'huszad'
}

# Numbers below 2 thousand are written in one word in Hungarian
# Numbers above 2 thousand are separated by hyphens
# In some circumstances it may be better to separate individual words
# Set _EXTRA_SPACE_HU=" " for separating numbers below 2 thousand (
# orthographically incorrect)
# Set _EXTRA_SPACE_HU="" for correct spelling, this is standard

# _EXTRA_SPACE_HU = " "
_EXTRA_SPACE_HU = ""

# [archive member header] lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_it.py000066400000000000000000000162431426211343400251000ustar00rootroot00000000000000
import collections

_SHORT_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e9: 'miliardesimo',
    1e12: 'trilionesimo',
    1e15: 'quadrilionesimo',
    1e18: 'quintilionesimo',  # was truncated to 'quintilionesim'
    1e21: 'sestilionesimo',
    1e24: 'settilionesimo',
    1e27: 'ottilionesimo',
    1e30: 'nonilionesimo',
    1e33: 'decilionesimo'
    # TODO > 1e33
}

# For values > 10e12 only the word ending was changed; to be fixed once
# debugging is finished.
_LONG_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e12: 'bilionesimo',
    1e18: 'trilionesimo',
    1e24: 'quadrilionesimo',
    1e30: 'quintilionesimo',
    1e36: 'sestilionesimo',
    1e42: 'settilionesimo',
    1e48: 'ottilionesimo',
    1e54: 'nonilionesimo',
    1e60: 'decilionesimo'
    # TODO > 1e60
}

# Undefined articles ['un', 'una', 'un\''] can not be suppressed,
# in Italian, 'un cavallo' means 'a horse' or 'one horse'.
_ARTICLES_IT = ['il', 'lo', 'la', 'i', 'gli', 'le']

# Italian number words (including elided and collective forms) -> value.
_STRING_NUM_IT = {
    'zero': 0,
    'un': 1,
    'uno': 1,
    'una': 1,
    'un\'': 1,
    'due': 2,
    'tre': 3,
    'quattro': 4,
    'cinque': 5,
    'sei': 6,
    'sette': 7,
    'otto': 8,
    'nove': 9,
    'dieci': 10,
    'undici': 11,
    'dodici': 12,
    'tredici': 13,
    'quattordici': 14,
    'quindici': 15,
    'sedici': 16,
    'diciassette': 17,
    'diciotto': 18,
    'diciannove': 19,
    'venti': 20,
    'vent': 20,
    'trenta': 30,
    'trent': 30,
    'quaranta': 40,
    'quarant': 40,
    'cinquanta': 50,
    'cinquant': 50,
    'sessanta': 60,
    'sessant': 60,
    'settanta': 70,
    'settant': 70,
    'ottanta': 80,
    'ottant': 80,
    'novanta': 90,
    'novant': 90,
    'cento': 100,
    'duecento': 200,
    'trecento': 300,
    'quattrocento': 400,
    'cinquecento': 500,
    'seicento': 600,
    'settecento': 700,
    'ottocento': 800,
    'novecento': 900,
    'mille': 1000,
    'mila': 1000,
    'centomila': 100000,
    'milione': 1000000,
    'miliardo': 1000000000,
    'primo': 1,
    'secondo': 2,
    'mezzo': 0.5,
    'mezza': 0.5,
    'paio': 2,
    'decina': 10,
    'decine': 10,
    'dozzina': 12,
    'dozzine': 12,
    'centinaio': 100,
    'centinaia': 100,
    'migliaio': 1000,
    'migliaia': 1000
}

_NUM_STRING_IT = {
    0: 'zero',
    1: 'uno',
    2: 'due',
    3: 'tre',
    4: 'quattro',
    5: 'cinque',
    6: 'sei',
    7: 'sette',
    8: 'otto',
    9: 'nove',
    10: 'dieci',
    11: 'undici',
    12: 'dodici',
    13: 'tredici',
    14: 'quattordici',
    15: 'quindici',
    16: 'sedici',
    17: 'diciassette',
    18: 'diciotto',
    19: 'diciannove',
    20: 'venti',
    30: 'trenta',
    40: 'quaranta',
    50: 'cinquanta',
    60: 'sessanta',
    70: 'settanta',
    80: 'ottanta',
    90: 'novanta'
}

# Fraction words stored without their final vowel.
_FRACTION_STRING_IT = {
    2: 'mezz',
    3: 'terz',
    4: 'quart',
    5: 'quint',
    6: 'sest',
    7: 'settim',
    8: 'ottav',
    9: 'non',
    10: 'decim',
    11: 'undicesim',
    12: 'dodicesim',
    13: 'tredicesim',
    14: 'quattordicesim',
    15: 'quindicesim',
    16: 'sedicesim',
    17: 'diciassettesim',
    18: 'diciottesim',
    19: 'diciannovesim',
    20: 'ventesim'
}

# source: http://tulengua.es/numeros-texto/default.aspx
# NOTE: float literals above roughly 1.8e308 evaluate to float('inf'), so
# 1e312, 1e336 and 1e366 below share the single key inf and only the last
# name survives. Left unchanged here; TODO consider exact ints (10 ** n).
_LONG_SCALE_IT = collections.OrderedDict([
    (100, 'cento'),
    (1000, 'mila'),
    (1000000, 'milioni'),
    (1e9, "miliardi"),
    (1e12, "bilioni"),
    (1e18, 'trilioni'),
    (1e24, "quadrilioni"),
    (1e30, "quintilioni"),
    (1e36, "sestilioni"),
    (1e42, "settilioni"),
    (1e48, "ottillioni"),
    (1e54, "nonillioni"),
    (1e60, "decemillioni"),
    (1e66, "undicilione"),
    (1e72, "dodicilione"),
    (1e78, "tredicilione"),
    (1e84, "quattordicilione"),
    (1e90, "quindicilione"),
    (1e96, "sedicilione"),
    (1e102, "diciasettilione"),
    (1e108, "diciottilione"),
    (1e114, "dicianovilione"),
    (1e120, "vintilione"),
    (1e306, "unquinquagintilione"),
    (1e312, "duoquinquagintilione"),
    (1e336, "sesquinquagintilione"),
    (1e366, "unsexagintilione")
])

# NOTE: as above, every entry from 1e309 upward collapses onto the key inf.
_SHORT_SCALE_IT = collections.OrderedDict([
    (100, 'cento'),
    (1000, 'mila'),
    (1000000, 'milioni'),
    (1e9, "miliardi"),
    (1e12, 'bilioni'),
    (1e15, "biliardi"),
    (1e18, "trilioni"),
    (1e21, "triliardi"),
    (1e24, "quadrilioni"),
    (1e27, "quadriliardi"),
    (1e30, "quintilioni"),
    (1e33, "quintiliardi"),
    (1e36, "sestilioni"),
    (1e39, "sestiliardi"),
    (1e42, "settilioni"),
    (1e45, "settiliardi"),
    (1e48, "ottilioni"),
    (1e51, "ottiliardi"),
    (1e54, "nonilioni"),
    (1e57, "noniliardi"),
    (1e60, "decilioni"),
    (1e63, "deciliardi"),
    (1e66, "undicilioni"),
    (1e69, "undiciliardi"),
    (1e72, "dodicilioni"),
    (1e75, "dodiciliardi"),
    (1e78, "tredicilioni"),
    (1e81, "trediciliardi"),
    (1e84, "quattordicilioni"),
    (1e87, "quattordiciliardi"),
    (1e90, "quindicilioni"),
    (1e93, "quindiciliardi"),
    (1e96, "sedicilioni"),
    (1e99, "sediciliardi"),
    (1e102, "diciassettilioni"),
    (1e105, "diciassettiliardi"),
    (1e108, "diciottilioni"),
    (1e111, "diciottiliardi"),
    (1e114, "dicianovilioni"),
    (1e117, "dicianoviliardi"),
    (1e120, "vintilioni"),
    (1e123, "vintiliardi"),
    (1e153, "quinquagintillion"),
    (1e183, "sexagintillion"),
    (1e213, "septuagintillion"),
    (1e243, "ottogintilioni"),
    (1e273, "nonigintillioni"),
    (1e303, "centilioni"),
    (1e306, "uncentilioni"),
    (1e309, "duocentilioni"),
    (1e312, "trecentilioni"),
    (1e333, "decicentilioni"),
    (1e336, "undicicentilioni"),
    (1e363, "viginticentilioni"),
    (1e366, "unviginticentilioni"),
    (1e393, "trigintacentilioni"),
    (1e423, "quadragintacentillion"),
    (1e453, "quinquagintacentillion"),
    (1e483, "sexagintacentillion"),
    (1e513, "septuagintacentillion"),
    (1e543, "octogintacentillion"),  # was misspelled "ctogintacentillion"
    (1e573, "nonagintacentillion"),
    (1e603, "ducentillion"),
    (1e903, "trecentillion"),
    (1e1203, "quadringentillion"),
    (1e1503, "quingentillion"),
    (1e1803, "sescentillion"),
    (1e2103, "septingentillion"),
    (1e2403, "octingentillion"),
    (1e2703, "nongentillion"),
    (1e3003, "millinillion")
])

# [archive member header] lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_nl.py000066400000000000000000000200341426211343400250660ustar00rootroot00000000000000
# -*- coding: utf-8 -*-
#
# Copyright 2019 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from collections import OrderedDict

from .parse_common import invert_dict

_ARTICLES_NL = {'de', 'het'}

# NOTE: this name is bound a second time further down; the value assigned
# here is the one _STRING_NUM_NL is built from.
_NUM_STRING_NL = {
    0: 'nul',
    1: 'een',
    2: 'twee',
    3: 'drie',
    4: 'vier',
    5: 'vijf',
    6: 'zes',
    7: 'zeven',
    8: 'acht',
    9: 'negen',
    10: 'tien',
    11: 'elf',
    12: 'twaalf',
    13: 'dertien',
    14: 'veertien',
    15: 'vijftien',
    16: 'zestien',
    17: 'zeventien',
    18: 'achttien',
    19: 'negentien',
    20: 'twintig',
    30: 'dertig',
    40: 'veertig',
    50: 'vijftig',
    60: 'zestig',
    70: 'zeventig',
    80: 'tachtig',
    90: 'negentig'
}

_FRACTION_STRING_NL = {
    2: 'half',
    3: 'derde',
    4: 'vierde',
    5: 'vijfde',
    6: 'zesde',
    7: 'zevende',
    8: 'achtste',
    9: 'negende',
    10: 'tiende',
    11: 'elfde',
    12: 'twaalfde',
    13: 'dertiende',
    14: 'veertiende',
    15: 'vijftiende',
    16: 'zestiende',
    17: 'zeventiende',
    18: 'achttiende',
    19: 'negentiende',
    20: 'twintigste'
}

# NOTE: float literals above roughly 1.8e308 evaluate to float('inf'), so
# 1e312, 1e336 and 1e366 below share the single key inf and only the last
# name survives. Left unchanged here; TODO consider exact ints (10 ** n).
_LONG_SCALE_NL = OrderedDict([
    (100, 'honderd'),
    (1000, 'duizend'),
    (1000000, 'miljoen'),
    (1e12, "biljoen"),
    (1e18, 'triljoen'),
    (1e24, "quadriljoen"),
    (1e30, "quintillion"),
    (1e36, "sextillion"),
    (1e42, "septillion"),
    (1e48, "octillion"),
    (1e54, "nonillion"),
    (1e60, "decillion"),
    (1e66, "undecillion"),
    (1e72, "duodecillion"),
    (1e78, "tredecillion"),
    (1e84, "quattuordecillion"),
    (1e90, "quinquadecillion"),
    (1e96, "sedecillion"),
    (1e102, "septendecillion"),
    (1e108, "octodecillion"),
    (1e114, "novendecillion"),
    (1e120, "vigintillion"),
    (1e306, "unquinquagintillion"),
    (1e312, "duoquinquagintillion"),
    (1e336, "sesquinquagintillion"),
    (1e366, "unsexagintillion")
])

# NOTE: as above, every entry from 1e309 upward collapses onto the key inf.
_SHORT_SCALE_NL = OrderedDict([
    (100, 'honderd'),
    (1000, 'duizend'),
    (1000000, 'miljoen'),
    (1e9, "miljard"),
    (1e12, 'biljoen'),
    (1e15, "quadrillion"),
    (1e18, "quintiljoen"),
    (1e21, "sextiljoen"),
    (1e24, "septiljoen"),
    (1e27, "octiljoen"),
    (1e30, "noniljoen"),
    (1e33, "deciljoen"),
    (1e36, "undeciljoen"),
    (1e39, "duodeciljoen"),
    (1e42, "tredeciljoen"),
    (1e45, "quattuordeciljoen"),
    (1e48, "quinquadeciljoen"),
    (1e51, "sedeciljoen"),
    (1e54, "septendeciljoen"),
    (1e57, "octodeciljoen"),
    (1e60, "novendeciljoen"),
    (1e63, "vigintiljoen"),
    (1e66, "unvigintiljoen"),
    (1e69, "duovigintiljoen"),  # was misspelled "uuovigintiljoen"
    (1e72, "tresvigintiljoen"),
    (1e75, "quattuorvigintiljoen"),
    (1e78, "quinquavigintiljoen"),
    (1e81, "sesvigintiljoen"),  # was misspelled "qesvigintiljoen"
    (1e84, "septemvigintiljoen"),
    (1e87, "octovigintiljoen"),
    (1e90, "novemvigintiljoen"),
    (1e93, "trigintiljoen"),
    (1e96, "untrigintiljoen"),
    (1e99, "duotrigintiljoen"),
    (1e102, "trestrigintiljoen"),
    (1e105, "quattuortrigintiljoen"),
    (1e108, "quinquatrigintiljoen"),
    (1e111, "sestrigintiljoen"),
    (1e114, "septentrigintiljoen"),
    (1e117, "octotrigintiljoen"),
    (1e120, "noventrigintiljoen"),
    (1e123, "quadragintiljoen"),
    (1e153, "quinquagintiljoen"),
    (1e183, "sexagintiljoen"),
    (1e213, "septuagintiljoen"),
    (1e243, "octogintiljoen"),
    (1e273, "nonagintiljoen"),
    (1e303, "centiljoen"),
    (1e306, "uncentiljoen"),
    (1e309, "duocentiljoen"),
    (1e312, "trescentiljoen"),
    (1e333, "decicentiljoen"),
    (1e336, "undecicentiljoen"),
    (1e363, "viginticentiljoen"),
    (1e366, "unviginticentiljoen"),
    (1e393, "trigintacentiljoen"),
    (1e423, "quadragintacentiljoen"),
    (1e453, "quinquagintacentiljoen"),
    (1e483, "sexagintacentiljoen"),
    (1e513, "septuagintacentiljoen"),
    (1e543, "octogintacentiljoen"),  # was misspelled "ctogintacentiljoen"
    (1e573, "nonagintacentiljoen"),
    (1e603, "ducentiljoen"),
    (1e903, "trecentiljoen"),
    (1e1203, "quadringentiljoen"),
    (1e1503, "quingentiljoen"),
    (1e1803, "sescentiljoen"),
    (1e2103, "septingentiljoen"),
    (1e2403, "octingentiljoen"),
    (1e2703, "nongentiljoen"),
    (1e3003, "milliniljoen")
])

_ORDINAL_STRING_BASE_NL = {
    1: 'eerste',
    2: 'tweede',
    3: 'derde',
    4: 'vierde',
    5: 'vijfde',
    6: 'zesde',
    7: 'zevende',
    8: 'achtste',
    9: 'negende',
    10: 'tiende',
    11: 'elfde',
    12: 'twaalfde',
    13: 'dertiende',
    14: 'veertiende',
    15: 'vijftiende',
    16: 'zestiende',
    17: 'zeventiende',
    18: 'achttiende',
    19: 'negentiende',
    20: 'twintigste',
    30: 'dertigste',
    40: "veertigste",
    50: "vijftigste",
    60: "zestigste",
    70: "zeventigste",
    80: "tachtigste",
    90: "negentigste",
    # Was keyed as 10e3 (== 10000); "honderdste" is the hundredth.
    1e2: "honderdste",
    1e3: "duizendste"
}

_SHORT_ORDINAL_STRING_NL = {
    1e6: "miljoenste",  # was misspelled "miloenste"
    1e9: "miljardste",
    1e12: "biljoenste",
    1e15: "biljardste",
    1e18: "triljoenste",
    1e21: "triljardste",  # was misspelled "trijardste"
    1e24: "quadriljoenste",
    1e27: "quadriljardste",
    1e30: "quintiljoenste",
    1e33: "quintiljardste"
    # TODO > 1e33
}
_SHORT_ORDINAL_STRING_NL.update(_ORDINAL_STRING_BASE_NL)

_LONG_ORDINAL_STRING_NL = {
    1e6: "miljoenste",  # was misspelled "miloenste"
    1e9: "miljardste",
    1e12: "biljoenste",
    1e15: "biljardste",
    1e18: "triljoenste",
    1e21: "triljardste",  # was misspelled "trijardste"
    1e24: "quadriljoenste",
    1e27: "quadriljardste",
    1e30: "quintiljoenste",
    1e33: "quintiljardste"
    # TODO > 1e60
}
_LONG_ORDINAL_STRING_NL.update(_ORDINAL_STRING_BASE_NL)

# negate next number (-2 = 0 - 2)
_NEGATIVES_NL = {"min", "minus"}

# sum the next number (twenty two = 20 + 2)
# 'tachtig' (80) was misspelled 'techtig', so it never matched real input.
_SUMS_NL = {'twintig', '20', 'dertig', '30', 'veertig', '40', 'vijftig',
            '50', 'zestig', '60', 'zeventig', '70', 'tachtig', '80',
            'negentig', '90'}

_MULTIPLIES_LONG_SCALE_NL = set(_LONG_SCALE_NL.values())

_MULTIPLIES_SHORT_SCALE_NL = set(_SHORT_SCALE_NL.values())

# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
_FRACTION_MARKER_NL = {"en"}

# decimal marker ( 1 point 5 = 1 + 0.5)
_DECIMAL_MARKER_NL = {"komma", "punt"}

_STRING_NUM_NL = invert_dict(_NUM_STRING_NL)
_STRING_NUM_NL.update({
    "half": 0.5,
    "driekwart": 0.75,
    "anderhalf": 1.5,
    "paar": 2
})

_STRING_SHORT_ORDINAL_NL = invert_dict(_SHORT_ORDINAL_STRING_NL)
_STRING_LONG_ORDINAL_NL = invert_dict(_LONG_ORDINAL_STRING_NL)

_MONTHS_NL = ['januari', 'februari', 'maart', 'april', 'mei', 'juni',
              'juli', 'augustus', 'september', 'oktober', 'november',
              'december']

# Second binding of _NUM_STRING_NL: replaces the table defined above from
# this point on (accented 'één', plus an entry for 100).
_NUM_STRING_NL = {
    0: 'nul',
    1: 'één',
    2: 'twee',
    3: 'drie',
    4: 'vier',
    5: 'vijf',
    6: 'zes',
    7: 'zeven',
    8: 'acht',
    9: 'negen',
    10: 'tien',
    11: 'elf',
    12: 'twaalf',
    13: 'dertien',
    14: 'veertien',
    15: 'vijftien',
    16: 'zestien',
    17: 'zeventien',
    18: 'achttien',  # was misspelled 'actien'
    19: 'negentien',
    20: 'twintig',
    30: 'dertig',
    40: 'veertig',
    50: 'vijftig',
    60: 'zestig',
    70: 'zeventig',
    80: 'tachtig',
    90: 'negentig',
    100: 'honderd'
}

# Dutch uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales
# Currently, numbers are limited to 1000000000000000000000000,
# but _NUM_POWERS_OF_TEN can be extended to include additional number words
_NUM_POWERS_OF_TEN = [
    '', 'duizend', 'miljoen', 'miljard', 'biljoen', 'biljard', 'triljoen',
    'triljard'
]

# Numbers below 1 million are written in one word in dutch, yielding very
# long words
# In some circumstances it may be better to separate individual words
# Set _EXTRA_SPACE_NL=" " for separating numbers below 1 million (
# orthographically incorrect)
# Set _EXTRA_SPACE_NL="" for correct spelling, this is standard

# _EXTRA_SPACE_NL = " "
_EXTRA_SPACE_NL = ""

# [archive member header] lingua-franca-release-v0.4.3/lingua_franca/lang/common_data_pl.py000066400000000000000000000263531426211343400251020ustar00rootroot00000000000000
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# from collections import OrderedDict _NUM_STRING_PL = { 0: 'zero', 1: 'jeden', 2: 'dwa', 3: 'trzy', 4: 'cztery', 5: 'pięć', 6: 'sześć', 7: 'siedem', 8: 'osiem', 9: 'dziewięć', 10: 'dziesięć', 11: 'jedenaście', 12: 'dwanaście', 13: 'trzynaście', 14: 'czternaście', 15: 'piętnaście', 16: 'szesnaście', 17: 'siedemnaście', 18: 'osiemnaście', 19: 'dziewiętnaście', 20: 'dwadzieścia', 30: 'trzydzieści', 40: 'czterdzieści', 50: 'pięćdziesiąt', 60: 'sześćdziesiąt', 70: 'siedemdziesiąt', 80: 'osiemdziesiąt', 90: 'dziewięćdziesiąt', 100: 'sto', 200: 'dwieście', 300: 'trzysta', 400: 'czterysta', 500: 'pięćset', 600: 'sześćset', 700: 'siedemset', 800: 'osiemset', 900: 'dziewięćset', } _FRACTION_STRING_PL = { 1: 'jedna', 2: 'druga', 3: 'trzecia', 4: 'czwarta', 5: 'piąta', 6: 'szósta', 7: 'siódma', 8: 'ósma', 9: 'dziewiąta', 10: 'dziesiąta', 11: 'jedenasta', 12: 'dwunasta', 13: 'trzynasta', 14: 'czternasta', 15: 'piętnasta', 16: 'szesnasta', 17: 'siedemnasta', 18: 'osiemnasta', 19: 'dziewiętnasta', 20: 'dwudziesta', 30: 'trzydziesta', 40: 'czterdziesta', 50: 'pięćdziesiąta', 60: 'sześćdziesiąta', 70: 'siedemdziesiąta', 80: 'osiemdziesiąta', 90: 'dziewięćdziesiąta', 100: 'setna', 200: 'dwusetna', 300: 'trzysetna', 400: 'czterysetna', 500: 'pięćsetna', 600: 'sześćsetna', 700: 'siedemsetna', 800: 'osiemsetna', 900: 'dziewięćsetna', 1000: 'tysięczna', } _SHORT_SCALE_PL = OrderedDict([ (100, 'sto'), (200, 'dwieście'), (300, 'trzysta'), (400, 'czterysta'), (500, 'pięćset'), (600, 'sześćset'), (700, 'siedemset'), (800, 'osiemset'), (900, 'dziewięćset'), (1000, 'tysiąc'), (1000000, 'milion'), (1e9, "miliard"), (1e12, 'bilion'), (1e15, "biliard"), (1e18, "trylion"), (1e21, "sekstilion"), (1e24, "kwadrylion"), (1e27, "kwadryliard"), (1e30, "kwintylion"), (1e33, "kwintyliard"), (1e36, "sekstylion"), (1e39, "sekstyliard"), (1e42, "septylion"), (1e45, "septyliard"), (1e48, "oktylion"), (1e51, "oktyliard"), (1e54, "nonilion"), (1e57, "noniliard"), (1e60, "decylion"), (1e63, "decyliard"), 
(1e66, "undecylion"), (1e69, "undecyliard"), (1e72, "duodecylion"), (1e75, "duodecyliard"), (1e78, "tredecylion"), (1e81, "tredecyliard"), (1e84, "kwartyduodecylion"), (1e87, "kwartyduodecyliard"), (1e90, "kwintyduodecylion"), (1e93, "kwintyduodecyliard"), (1e96, "seksdecylion"), (1e99, "seksdecyliard"), (1e102, "septydecylion"), (1e105, "septydecyliard"), (1e108, "oktodecylion"), (1e111, "oktodecyliard"), (1e114, "nondecylion"), (1e117, "nondecyliard"), (1e120, "wigintylion"), (1e123, "wigintyliard"), (1e153, "quinquagintylion"), (1e183, "trycyliard"), (1e213, "septuagintylion"), (1e243, "kwadragiliard"), (1e273, "nonagintylion"), (1e303, "centezylion"), (1e306, "uncentylion"), (1e309, "duocentylion"), (1e312, "trescentylion"), (1e333, "decicentylion"), (1e336, "undecicentylion"), (1e363, "viginticentylion"), (1e366, "unviginticentylion"), (1e393, "trigintacentylion"), (1e423, "quadragintacentylion"), (1e453, "quinquagintacentylion"), (1e483, "sexagintacentylion"), (1e513, "septuagintacentylion"), (1e543, "ctogintacentylion"), (1e573, "nonagintacentylion"), (1e603, "centyliard"), (1e903, "trecentylion"), (1e1203, "quadringentylion"), (1e1503, "quingentylion"), (1e1803, "sescentylion"), (1e2103, "septingentylion"), (1e2403, "octingentylion"), (1e2703, "nongentylion"), (1e3003, "milinylion") ]) _ORDINAL_BASE_PL = { 1: 'pierwszy', 2: 'drugi', 3: 'trzeci', 4: 'czwarty', 5: 'piąty', 6: 'szósty', 7: 'siódmy', 8: 'ósmy', 9: 'dziewiąty', 10: 'dziesiąty', 11: 'jedenasty', 12: 'dwunasty', 13: 'trzynasty', 14: 'czternasty', 15: 'piętnasty', 16: 'szesnasty', 17: 'siedemnasty', 18: 'osiemnasty', 19: 'dziewiętnasty', 20: 'dwudziesty', 30: 'trzydziesty', 40: "czterdziesty", 50: "pięćdziesiąty", 60: "sześćdziesiąty", 70: "siedemdziesiąty", 80: "osiemdziesiąty", 90: "dziewięćdziesiąty", 1e2: "setny", 1e3: "tysięczny" } _SHORT_ORDINAL_PL = { 1e6: "milionowy", 1e9: "miliardowy", 1e12: "bilionowy", 1e15: "biliardowy", 1e18: "trylionowy", 1e21: "tryliardowy", 1e24: "kwadrylionowy", 
1e27: "kwadryliardowy", 1e30: "kwintylionowy", 1e33: "kwintyliardowy", 1e36: "sektylionowy", 1e42: "septylionowy", 1e48: "oktylionowy", 1e54: "nonylionowy", 1e60: "decylionowy" # TODO > 1e-33 } _SHORT_ORDINAL_PL.update(_ORDINAL_BASE_PL) _ALT_ORDINALS_PL = { 1: 'pierwszej', 2: 'drugiej', 3: 'trzeciej', 4: 'czwartej', 5: 'piątej', 6: 'szóstej', 7: 'siódmej', 8: 'ósmej', 9: 'dziewiątej', 10: 'dziesięcio', 11: 'jedenasto', 12: 'dwunasto', 13: 'trzynasto', 14: 'czternasto', 15: 'piętnasto', 16: 'szesnasto', 17: 'siedemnasto', 18: 'osiemnasto', 19: 'dziewiętnasto', 20: 'dwudziesto', 30: 'trzydziesto', 40: 'czterdziesto', 50: 'pięćdziesiecio', 60: 'sześćdziesięcio', 70: 'siedemdziesięcio', 80: 'osiemdziesięcio', 90: 'dziewięćdziesięcio', } _TIME_UNITS_CONVERSION = { 'mikrosekund': 'microseconds', 'mikrosekundy': 'microseconds', 'milisekund': 'milliseconds', 'milisekundy': 'milliseconds', 'sekunda': 'seconds', 'sekundy': 'seconds', 'sekund': 'seconds', 'minuta': 'minutes', 'minuty': 'minutes', 'minut': 'minutes', 'godzina': 'hours', 'godziny': 'hours', 'godzin': 'hours', 'dzień': 'days', 'dni': 'days', 'tydzień': 'weeks', 'tygodni': 'weeks', 'tygodnie': 'weeks', 'tygodniu': 'weeks', } _TIME_UNITS_NORMALIZATION = { 'mikrosekunda': 'mikrosekunda', 'mikrosekundę': 'mikrosekunda', 'mikrosekund': 'mikrosekunda', 'mikrosekundy': 'mikrosekunda', 'milisekunda': 'milisekunda', 'milisekundę': 'milisekunda', 'milisekund': 'milisekunda', 'milisekundy': 'milisekunda', 'sekunda': 'sekunda', 'sekundę': 'sekunda', 'sekundy': 'sekunda', 'sekund': 'sekunda', 'minuta': 'minuta', 'minutę': 'minuta', 'minut': 'minuta', 'minuty': 'minuta', 'godzina': 'godzina', 'godzinę': 'godzina', 'godzin': 'godzina', 'godziny': 'godzina', 'dzień': 'dzień', 'dni': 'dzień', 'tydzień': 'tydzień', 'tygodni': 'tydzień', 'tygodnie': 'tydzień', 'tygodniu': 'tydzień', 'miesiąc': 'miesiąc', 'miesiące': 'miesiąc', 'miesięcy': 'miesiąc', 'rok': 'rok', 'lata': 'rok', 'lat': 'rok', 'dekada': 'dekada', 'dekad': 'dekada', 
'dekady': 'dekada', 'dekadę': 'dekada', 'wiek': 'wiek', 'wieki': 'wiek', 'milenia': 'milenia', 'milenium': 'milenia', } _MONTHS_TO_EN = { 'styczeń': 'January', 'stycznia': 'January', 'luty': 'February', 'lutego': 'February', 'marzec': 'March', 'marca': 'March', 'kwiecień': 'April', 'kwietnia': 'April', 'maj': 'May', 'maja': 'May', 'czerwiec': 'June', 'czerwca': 'June', 'lipiec': 'July', 'lipca': 'July', 'sierpień': 'August', 'sierpnia': 'August', 'wrzesień': 'September', 'września': 'September', 'październik': 'October', 'października': 'October', 'listopad': 'November', 'listopada': 'November', 'grudzień': 'December', 'grudnia': 'December', } _DAYS_TO_EN = { 'poniedziałek': 0, 'poniedziałkach': 0, 'poniedziałkami': 0, 'poniedziałki': 0, 'poniedziałkiem': 0, 'poniedziałkom': 0, 'poniedziałkowa': 0, 'poniedziałkową': 0, 'poniedziałkowe': 0, 'poniedziałkowego': 0, 'poniedziałkowej': 0, 'poniedziałkowemu': 0, 'poniedziałkowi': 0, 'poniedziałkowy': 0, 'poniedziałkowych': 0, 'poniedziałkowym': 0, 'poniedziałkowymi': 0, 'poniedziałków': 0, 'poniedziałku': 0, 'wtorek': 1, 'wtorkach': 1, 'wtorkami': 1, 'wtorki': 1, 'wtorkiem': 1, 'wtorkom': 1, 'wtorkowa': 1, 'wtorkową': 1, 'wtorkowe': 1, 'wtorkowego': 1, 'wtorkowej': 1, 'wtorkowemu': 1, 'wtorkowi': 1, 'wtorkowy': 1, 'wtorkowych': 1, 'wtorkowym': 1, 'wtorkowymi': 1, 'wtorków': 1, 'wtorku': 1, 'środa': 2, 'środach': 2, 'środami': 2, 'środą': 2, 'środę': 2, 'środo': 2, 'środom': 2, 'środowa': 2, 'środową': 2, 'środowe': 2, 'środowego': 2, 'środowej': 2, 'środowemu': 2, 'środowi': 2, 'środowy': 2, 'środowych': 2, 'środowym': 2, 'środowymi': 2, 'środy': 2, 'środzie': 2, 'śród': 2, 'czwartek': 3, 'czwartkach': 3, 'czwartkami': 3, 'czwartki': 3, 'czwartkiem': 3, 'czwartkom': 3, 'czwartkowa': 3, 'czwartkową': 3, 'czwartkowe': 3, 'czwartkowego': 3, 'czwartkowej': 3, 'czwartkowemu': 3, 'czwartkowi': 3, 'czwartkowy': 3, 'czwartkowych': 3, 'czwartkowym': 3, 'czwartkowymi': 3, 'czwartków': 3, 'czwartku': 3, 'piątek': 4, 'piątkach': 4, 
# Shared Portuguese (pt) language data: articles, grammatical-gender hints
# and the number <-> word tables.

_FUNCTION_NOT_IMPLEMENTED_WARNING = "esta função não foi implementada em 'pt'"

# Undefined articles ["um", "uma", "uns", "umas"] can not be suppressed,
# in PT, "um cavalo" means "a horse" or "one horse".
_ARTICLES_PT = ["o", "a", "os", "as"]

# word rules for gender
_FEMALE_ENDINGS_PT = ["a", "as"]
_MALE_ENDINGS_PT = ["o", "os"]

# special cases, word lookup for words not covered by above rule
_GENDERS_PT = {
    "mulher": "f",
    "mulheres": "f",
    "homem": "m"
}

# context rules for gender
_MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"]
# The singular "esta" mirrors "este" in the male list; the original table
# listed "estas" twice and never matched the singular form.
_FEMALE_DETERMINANTS_PT = ["a", "as", "esta", "estas", "essa", "essas"]

# Number word -> value.
_NUMBERS_PT = {
    "zero": 0,
    "um": 1,
    "uma": 1,
    "uns": 1,
    "umas": 1,
    "primeiro": 1,
    "segundo": 2,
    "terceiro": 3,
    "dois": 2,
    "duas": 2,
    "tres": 3,
    "três": 3,
    "quatro": 4,
    "cinco": 5,
    "seis": 6,
    "sete": 7,
    "oito": 8,
    "nove": 9,
    "dez": 10,
    "onze": 11,
    "doze": 12,
    "treze": 13,
    "catorze": 14,
    "quinze": 15,
    "dezasseis": 16,
    "dezassete": 17,
    "dezoito": 18,
    "dezanove": 19,
    "vinte": 20,
    "trinta": 30,
    "quarenta": 40,
    "cinquenta": 50,
    "sessenta": 60,
    "setenta": 70,
    "oitenta": 80,
    "noventa": 90,
    "cem": 100,
    "cento": 100,
    "duzentos": 200,
    "duzentas": 200,
    "trezentos": 300,
    "trezentas": 300,
    "quatrocentos": 400,
    "quatrocentas": 400,
    "quinhentos": 500,
    "quinhentas": 500,
    "seiscentos": 600,
    "seiscentas": 600,
    "setecentos": 700,
    "setecentas": 700,
    "oitocentos": 800,
    "oitocentas": 800,
    "novecentos": 900,
    "novecentas": 900,
    "mil": 1000,
    "milhão": 1000000}

# Denominator -> singular fraction word.
_FRACTION_STRING_PT = {
    2: 'meio',
    3: 'terço',
    4: 'quarto',
    5: 'quinto',
    6: 'sexto',
    7: 'sétimo',
    8: 'oitavo',
    9: 'nono',
    10: 'décimo',
    11: 'onze avos',
    12: 'doze avos',
    13: 'treze avos',
    14: 'catorze avos',
    15: 'quinze avos',
    16: 'dezasseis avos',
    17: 'dezassete avos',
    18: 'dezoito avos',
    19: 'dezanove avos',
    20: 'vigésimo',
    30: 'trigésimo',
    100: 'centésimo',
    1000: 'milésimo'
}

# Value -> number word (units, teens and round tens only).
_NUM_STRING_PT = {
    0: 'zero',
    1: 'um',
    2: 'dois',
    3: 'três',
    4: 'quatro',
    5: 'cinco',
    6: 'seis',
    7: 'sete',
    8: 'oito',
    9: 'nove',
    10: 'dez',
    11: 'onze',
    12: 'doze',
    13: 'treze',
    14: 'catorze',
    15: 'quinze',
    16: 'dezasseis',
    17: 'dezassete',
    18: 'dezoito',
    19: 'dezanove',
    20: 'vinte',
    30: 'trinta',
    40: 'quarenta',
    50: 'cinquenta',
    60: 'sessenta',
    70: 'setenta',
    80: 'oitenta',
    90: 'noventa'
}
# Long-scale names of powers of ten, in ascending order.  Keys are the float
# values 1e<exponent>; they are parsed from text so they are bit-identical to
# the corresponding float literals (10.0 ** n is not guaranteed to be).
_LONG_SCALE_RU = OrderedDict(
    (float("1e{}".format(exponent)), name)
    for exponent, name in (
        (3, 'тысяча'),
        (6, "миллион"),
        (9, "миллиард"),
        (12, "биллион"),
        (15, "биллиард"),
        (18, "триллион"),
        (21, "триллиард"),
        (24, "квадриллион"),
        (27, "квадриллиард"),
        (30, "квинтиллион"),
        (33, "квинтиллиард"),
        (36, "секстиллион"),
        (39, "секстиллиард"),
        (42, "септиллион"),
        (45, "септиллиард"),
        (48, "октиллион"),
        (51, "октиллиард"),
        (54, "нониллион"),
        (57, "нониллиард"),
        (60, "дециллион"),
        (63, "дециллиард"),
        (66, "ундециллион"),
        (72, "дуодециллион"),
        (78, "тредециллион"),
        (84, "кваттордециллион"),
        (90, "квиндециллион"),
        (96, "сексдециллион"),
        (102, "септендециллион"),
        (108, "октодециллион"),
        (114, "новемдециллион"),
        (120, "вигинтиллион"),
    )
)
# Time
# Russian time-unit word forms -> English plural unit name.
_TIME_UNITS_CONVERSION = {
    'микросекунд': 'microseconds',
    # 'милисекунд' (single "л") is a misspelling; it is kept so existing
    # lookups keep working, and the correct spelling is added next to it.
    'милисекунд': 'milliseconds',
    'миллисекунд': 'milliseconds',
    'секунда': 'seconds',
    'секунды': 'seconds',
    'секунд': 'seconds',
    'минута': 'minutes',
    'минуты': 'minutes',
    'минут': 'minutes',
    'час': 'hours',
    'часа': 'hours',
    'часов': 'hours',
    'день': 'days',
    'дня': 'days',
    'дней': 'days',
    'неделя': 'weeks',
    'недели': 'weeks',
    'недель': 'weeks'
}
# Ordinal words shared by both scales (1..20, round tens, 100 and 1000).
_ORDINAL_BASE_SL = {
    1: 'prvi',
    2: 'drugi',
    3: 'tretji',
    4: 'četrti',
    5: 'peti',
    6: 'šesti',
    7: 'sedmi',
    8: 'osmi',
    9: 'deveti',
    10: 'deseti',
    11: 'enajsti',
    12: 'dvanajsti',
    13: 'trinajsti',
    14: 'štirinajsti',
    15: 'petnajsti',
    16: 'šestnajsti',
    17: 'sedemnajsti',
    18: 'osemnajsti',
    19: 'devetnajsti',
    20: 'dvajseti',
    30: 'trideseti',
    40: 'štirideseti',
    50: 'petdeseti',
    60: 'šestdeseti',
    70: 'sedemdeseti',
    80: 'osemdeseti',
    90: 'devetdeseti',
    1e2: 'stoti',
    1e3: 'tisoči'
}

# Long-scale ordinals for the large powers of ten, extended below with the
# shared base table.
_LONG_ORDINAL_SL = {
    1e6: 'milijonti',
    1e12: 'bilijonti',
    1e18: 'trilijonti',
    1e24: 'kvadrilijonti',
    1e30: 'kvintilijonti',  # was misspelled 'kvintiljonti'
    1e36: 'sekstilijonti',
    1e42: 'septilijonti',
    1e48: 'oktilijonti',
    1e54: 'nonilijonti',
    1e60: 'decilijonti'
    # TODO > 1e60
}
_LONG_ORDINAL_SL.update(_ORDINAL_BASE_SL)

# Short-scale ordinals for the large powers of ten, extended below with the
# shared base table.
_SHORT_ORDINAL_SL = {
    1e6: 'milijonti',
    1e9: 'bilijonti',
    1e12: 'trilijonti',
    1e15: 'kvadrilijonti',
    1e18: 'kvintilijonti',  # was misspelled 'kvintiljonti'
    1e21: 'sekstilijonti',
    1e24: 'septilijonti',
    1e27: 'oktilijonti',
    1e30: 'nonilijonti',
    1e33: 'decilijonti'
    # TODO > 1e33
}
_SHORT_ORDINAL_SL.update(_ORDINAL_BASE_SL)
from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_ca import _FRACTION_STRING_CA, \
    _NUM_STRING_CA
from lingua_franca.internal import lookup_variant
from decimal import Decimal
from enum import IntEnum


class TimeVariantCA(IntEnum):
    """Spoken-time styles understood by nice_time_ca."""
    DEFAULT = 0
    BELL = 1
    FULL_BELL = 2
    SPANISH_LIKE = 3


# FULL_BELL variant: (exclusive upper minute bound, quarter phrase).  The
# first bound larger than the minute selects the phrase; an empty phrase
# (and minute 59, which matches no bound) means "speak the plain hour".
_FULL_BELL_QUARTERS_CA = (
    (7, ""),
    (9, "mig quart"),
    (12, "mig quart passat"),
    (14, "mig quart ben passat"),
    (17, "un quart"),
    (20, "un quart tocat"),
    (22, "un quart ben tocat"),
    (24, "un quart i mig"),
    (27, "un quart i mig passat"),
    (29, "un quart i mig ben passat"),
    (32, "dos quarts"),
    (35, "dos quarts tocats"),
    (37, "dos quarts ben tocats"),
    (39, "dos quarts i mig"),
    (42, "dos quarts i mig passats"),
    (44, "dos quarts i mig ben passats"),
    (47, "tres quarts"),
    (50, "tres quarts tocats"),
    (52, "tres quarts ben tocats"),
    (54, "tres quarts i mig"),
    (57, "tres quarts i mig passats"),
    (59, "tres quarts i mig ben passats"),
)


def nice_number_ca(number, speech, denominators=range(1, 21)):
    """ Catalan helper for nice_number

    This function formats a float to human understandable functions. Like
    4.5 becomes "4 i mig" for speech and "4 1/2" for text

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    result = convert_to_mixed_fraction(number, denominators)
    if not result:
        # Give up, just represent as a 3 decimal number
        return str(round(number, 3))

    whole, num, den = result

    if num == 0:
        # TODO: Number grouping?  E.g. "1,000,000"
        return str(whole)
    if not speech:
        return '{} {}/{}'.format(whole, num, den)

    # denominator word
    den_str = _FRACTION_STRING_CA[den]
    if whole == 0:
        if num == 1:
            # e.g. "un desè"
            return_string = 'un {}'.format(den_str)
        else:
            return_string = '{} {}'.format(num, den_str)
    elif num == 1:
        # Whole number plus a single fraction part: "4 i mig".  The previous
        # '{}-{}' format produced "4-mig", contradicting the documented
        # output.
        return_string = '{} i {}'.format(whole, den_str)
    else:
        # e.g. "20 i 3 desens"
        return_string = '{} i {} {}'.format(whole, num, den_str)
    # plural
    if num > 1:
        return_string += 's'
    return return_string


def pronounce_number_ca(number, places=2):
    """
    Convert a number to its spoken equivalent
    For example, '5.2' would return 'cinc coma dos'

    Args:
        number(float or int): the number to pronounce (under 100)
        places(int): maximum decimal places to speak
    Returns:
        (str): The pronounced number; numbers whose absolute value is 100 or
            more are returned as plain digits (str(number)).
    """
    if abs(number) >= 100:
        # TODO: Support n > 100
        return str(number)

    result = ""
    if number < 0:
        result = "menys "
    number = abs(number)

    if number >= 20:
        tens = int(number - int(number) % 10)
        ones = int(number - tens)
        result += _NUM_STRING_CA[tens]
        if ones > 0:
            # The twenties are joined with "-i-" (vint-i-cinc), the other
            # tens with a plain hyphen (trenta-cinc).
            joiner = "-i-" if tens == 20 else "-"
            result += joiner + _NUM_STRING_CA[ones]
    else:
        result += _NUM_STRING_CA[int(number)]

    # Deal with decimal part, in Catalan is commonly used the comma
    # instead the dot. Decimal part can be written both with comma
    # and dot, but when pronounced, its pronounced "coma"
    if not number == int(number) and places > 0:
        result += " coma"
        # str() switches to exponent notation for small floats ("1e-05"),
        # which has no "." to split on; Decimal's "f" format always yields
        # plain positional digits without adding float noise.
        decimals = format(Decimal(str(number)), "f").split(".")[1][0:places]
        for char in decimals:
            result += " " + _NUM_STRING_CA[int(char)]
    return result


def _spoken_hour_ca(hour):
    """Return the hour on a 12-hour dial with its feminine article."""
    hour %= 12
    if hour == 0:
        return "les dotze"
    if hour == 1:
        return "la una"
    if hour == 2:
        # 1 and 2 are pronounced in female form when talking about hours
        return "les dues"
    return "les " + pronounce_number_ca(hour)


def _bell_quarter_phrase_ca(minute):
    """Return the BELL quarter phrase for a minute, or "" if there is none."""
    if minute in (7, 8):
        return "mig quart"
    if minute < 15:
        return ""
    quarters = ("un quart", "dos quarts", "tres quarts")[minute // 15 - 1]
    extra = minute % 15
    if extra == 0:
        return quarters
    if extra == 1:
        return quarters + " i un minut"
    if extra in (7, 8):
        return quarters + " i mig"
    return quarters + " i " + pronounce_number_ca(extra) + " minuts"


def _bell_next_hour_ca(dt):
    """Return " de <upcoming hour> <day period>" for the bell variants.

    Every alternative starts with a space so it can be appended directly to
    a quarter phrase.
    """
    next_hour = (dt.hour + 1) % 12
    if next_hour == 0:
        return " de dotze" + (
            " del migdia" if dt.hour == 11 else " de la nit")
    if next_hour == 1:
        return " d'una" + (
            " de la tarda" if dt.hour == 12 else " de la matinada")
    if next_hour == 2:
        return " de dues" + (
            " de la tarda" if dt.hour == 13 else " de la nit")
    if next_hour == 11:
        return " d'onze" + (
            " de la nit" if dt.hour == 22 else " del matí")

    phrase = " de " + pronounce_number_ca(next_hour)
    if dt.hour < 5:
        phrase += " de la matinada"
    elif dt.hour < 11:
        phrase += " del matí"
    elif dt.hour == 11:
        phrase += " del migdia"
    elif dt.hour <= 17:
        phrase += " de la tarda"
    elif dt.hour < 21:
        # 18..20; the evening used to stop at 19, leaving 20h with no period
        phrase += " del vespre"
    else:
        phrase += " de la nit"
    return phrase


def _nice_time_bell_ca(dt):
    """Speak dt in the bell (quarters) system."""
    quarters = _bell_quarter_phrase_ca(dt.minute)
    if quarters:
        return quarters + _bell_next_hour_ca(dt)

    # No quarter phrase applies: say the current hour plus the minutes.
    speak = _spoken_hour_ca(dt.hour)
    if dt.minute == 0:
        speak += " en punt"
    elif dt.minute == 1:
        speak += " i un minut"
    else:
        speak += " i " + pronounce_number_ca(dt.minute) + " minuts"

    if dt.hour == 0:
        speak += " de la nit"
    elif dt.hour < 6:
        speak += " de la matinada"
    elif dt.hour < 12:
        # includes 11h, which previously got no day period at all
        speak += " del matí"
    elif dt.hour == 12:
        speak += " del migdia"
    elif dt.hour < 19:
        speak += " de la tarda"
    elif dt.hour < 21:
        speak += " del vespre"
    else:
        speak += " de la nit"
    return speak


def _nice_time_full_bell_ca(dt):
    """Speak dt in the full bell system (quarters, "tocats", "passats")."""
    quarters = next(
        (phrase for bound, phrase in _FULL_BELL_QUARTERS_CA
         if dt.minute < bound),
        "")
    if quarters:
        return quarters + _bell_next_hour_ca(dt)

    hour = dt.hour % 12
    if dt.minute == 59:
        # one minute to the hour is announced as the upcoming hour
        hour = (hour + 1) % 12
    speak = _spoken_hour_ca(hour)

    if dt.minute < 2:
        # minute 1 used to fall through to "ben tocades"
        speak += " en punt"
    elif dt.minute < 5:
        speak += " tocada" if hour == 1 else " tocades"
    elif dt.minute < 7:
        speak += " ben tocada" if hour == 1 else " ben tocades"

    # Day period; the inner tests catch the hour bumped by minute 59.
    if dt.hour == 0:
        speak += " de la matinada" if hour == 1 else " de la nit"
    elif dt.hour < 6:
        speak += " del matí" if hour == 6 else " de la matinada"
    elif dt.hour < 12:
        # hour is already modulo 12, so noon shows up as 0 (never 12)
        speak += " del migdia" if hour == 0 else " del matí"
    elif dt.hour == 12:
        speak += " de la tarda" if hour == 1 else " del migdia"
    elif dt.hour < 19:
        speak += " del vespre" if hour == 7 else " de la tarda"
    elif dt.hour < 21:
        speak += " de la nit" if hour == 9 else " del vespre"
    else:
        speak += " de la nit"
    return speak


def _nice_time_spanish_like_ca(dt):
    """Speak dt Spanish-style ("les tres menys quart")."""
    if dt.minute in (35, 40, 45, 50, 55):
        # counted back from the upcoming hour
        minute = dt.minute - 60
        hour = dt.hour + 1
    else:
        minute = dt.minute
        hour = dt.hour

    speak = _spoken_hour_ca(hour)
    if minute == 15:
        speak += " i quart"
    elif minute == 30:
        speak += " i mitja"
    elif minute == -15:
        speak += " menys quart"
    elif minute > 0:
        speak += " i " + pronounce_number_ca(minute)
    elif minute < 0:
        # pronounce_number_ca already says "menys ..." for negatives,
        # so no "i" is inserted
        speak += " " + pronounce_number_ca(minute)
    return speak


def _nice_time_default_ca(dt, use_24hour, use_ampm):
    """Speak dt as plain hour and minutes (default watch system)."""
    if use_24hour:
        # simply speak the number; hours ending in 1 or 2 take the
        # feminine form
        feminine = {1: "la una", 2: "les dues",
                    21: "les vint-i-una", 22: "les vint-i-dues"}
        speak = feminine.get(dt.hour) or (
            "les " + pronounce_number_ca(dt.hour))
        if dt.minute > 0:
            speak += " i " + pronounce_number_ca(dt.minute)
        return speak

    speak = _spoken_hour_ca(dt.hour)
    if dt.minute == 0:
        speak += " en punt"
    else:
        speak += " i " + pronounce_number_ca(dt.minute)

    # TODO: review day-periods
    if use_ampm:
        if dt.hour == 0:
            speak += " de la nit"
        elif dt.hour < 6:
            speak += " de la matinada"
        elif dt.hour < 12:
            speak += " del matí"
        elif dt.hour == 12:
            speak += " del migdia"
        elif dt.hour <= 18:
            speak += " de la tarda"
        elif dt.hour < 21:
            speak += " del vespre"
        else:
            speak += " de la nit"
    return speak


@lookup_variant({
    "default": TimeVariantCA.DEFAULT,
    "traditional": TimeVariantCA.FULL_BELL,
    "bell": TimeVariantCA.BELL,
    "full_bell": TimeVariantCA.FULL_BELL,
    "spanish": TimeVariantCA.SPANISH_LIKE
})
def nice_time_ca(dt, speech=True, use_24hour=False, use_ampm=False,
                 variant=None):
    """
    Format a time to a comfortable human format

    For example, generate 'cinc trenta' for speech or '5:30' for text
    display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
        variant (TimeVariantCA): spoken style; a false value selects
            TimeVariantCA.DEFAULT.  use_24hour and use_ampm only affect
            the display string and the DEFAULT spoken style.
    Returns:
        (str): The formatted time string
    """
    variant = variant or TimeVariantCA.DEFAULT

    if not speech:
        if use_24hour:
            # e.g. "03:01" or "14:22"
            return dt.strftime("%H:%M")
        # e.g. "3:01 AM" / "2:22 PM", or "3:01" / "2:22" without am/pm
        string = dt.strftime("%I:%M %p" if use_ampm else "%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros
        return string

    # Generate a speakable version of the time
    if variant == TimeVariantCA.BELL:
        # Bell Catalan Time System
        # https://en.wikipedia.org/wiki/Catalan_time_system
        return _nice_time_bell_ca(dt)
    if variant == TimeVariantCA.FULL_BELL:
        # Full Bell Catalan Time System
        return _nice_time_full_bell_ca(dt)
    if variant == TimeVariantCA.SPANISH_LIKE:
        return _nice_time_spanish_like_ca(dt)
    # Default Watch Time System
    return _nice_time_default_ca(dt, use_24hour, use_ampm)
def convert_to_mixed_fraction(number, denominators=range(1, 21)):
    """
    Convert floats to components of a mixed fraction representation

    Returns the closest fractional representation using the
    provided denominators.  For example, 4.500002 would become
    the whole number 4, the numerator 1 and the denominator 2

    The denominators are tried in the given order and the first one whose
    scaled fraction lies within 0.01 of an integer is used.  When the
    fraction rounds up to a full unit (e.g. 4.999) it is carried into the
    whole part, so the numerator is always smaller than the denominator.

    Args:
        number (float): number for convert
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        whole, numerator, denominator (int): Integers of the mixed fraction,
        or None when no denominator approximates the number closely enough.
        The whole part keeps the sign of the number; the numerator is never
        negative.
    """
    int_number = int(number)
    if int_number == number:
        return int_number, 0, 1  # whole number, no fraction

    frac_number = abs(number - int_number)
    if not denominators:
        denominators = range(1, 21)

    for denominator in denominators:
        scaled = frac_number * denominator
        numerator = int(round(scaled))
        if abs(scaled - numerator) < 0.01:  # 0.01 accuracy
            break
    else:
        return None

    if numerator and numerator == denominator:
        # The fraction rounded to a whole unit (previously returned as
        # e.g. 4 1/1); carry it away from zero into the whole part.
        carry = -1 if number < 0 else 1
        return int_number + carry, 0, 1

    return int_number, numerator, denominator
def nice_number_cs(number, speech=True, denominators=range(1, 21)):
    """ Czech helper for nice_number

    Formats a float as a mixed fraction.  For display 4.5 becomes "4 1/2";
    for speech the whole part and the fraction are joined with "a" and the
    fraction word is taken from _FRACTION_STRING_CS.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    mixed = convert_to_mixed_fraction(number, denominators)
    if not mixed:
        # No usable fraction: fall back to a 3 decimal number
        return str(round(number, 3))

    whole, num, den = mixed

    # A zero numerator reads the same in both output modes.
    if num == 0:
        # TODO: Number grouping?  E.g. "1,000,000"
        return str(whole)
    if not speech:
        return '{} {}/{}'.format(whole, num, den)

    # Assemble "<whole> a <num> <fraction word>", leaving out the whole part
    # when it is zero and the numerator when it is one.
    pieces = []
    if whole != 0:
        pieces.append('{} a'.format(whole))
    if num != 1:
        pieces.append('{}'.format(num))
    pieces.append('{}'.format(_FRACTION_STRING_CS[den]))
    phrase = ' '.join(pieces)

    # Adjust the ending of the fraction word to the numerator: the last
    # letter is dropped for 5 and more, and replaced by "y" for 2 to 4.
    if num > 4:
        phrase = phrase[:-1]
    elif num > 1:
        phrase = phrase[:-1] + 'y'
    return phrase
False), 'záporné ' if power < 0 else '', pronounce_number_cs(abs(power), places, short_scale, False)) if short_scale: number_names = _NUM_STRING_CS.copy() number_names.update(_SHORT_SCALE_CS) else: number_names = _NUM_STRING_CS.copy() number_names.update(_LONG_SCALE_CS) digits = [number_names[n] for n in range(0, 20)] tens = [number_names[n] for n in range(10, 100, 10)] if short_scale: hundreds = [_SHORT_SCALE_CS[n] for n in _SHORT_SCALE_CS.keys()] else: hundreds = [_LONG_SCALE_CS[n] for n in _LONG_SCALE_CS.keys()] # deal with zápornés result = "" if num < 0: result = "záporné " if scientific else "mínus " num = abs(num) if not ordinals: try: # deal with 4 digits # usually if it's a 4 digit num it should be said like a date # i.e. 1972 => nineteen seventy two if len(str(num)) == 4 and isinstance(num, int): _num = str(num) # deal with 1000, 2000, 2001, 2100, 3123, etc # is skipped as the rest of the # functin deals with this already if _num[1:4] == '000' or _num[1:3] == '00' or int(_num[0:2]) >= 20: pass # deal with 1900, 1300, etc # i.e. 1900 => nineteen hundred elif _num[2:4] == '00': first = number_names[int(_num[0:2])] last = number_names[100] return first + " " + last # deal with 1960, 1961, etc # i.e. 
1960 => nineteen sixty # 1961 => nineteen sixty one else: first = number_names[int(_num[0:2])] if _num[3:4] == '0': last = number_names[int(_num[2:4])] else: second = number_names[int(_num[2:3])*10] last = second + " " + number_names[int(_num[3:4])] return first + " " + last # exception used to catch any unforseen edge cases # will default back to normal subroutine except Exception as e: # TODO this probably shouldn't go to stdout print('ERROR: Exception in pronounce_number_cs: {}' + repr(e)) # check for a direct match if num in number_names and not ordinals: if num > 90: result += "jedna " result += number_names[num] else: def _sub_thousand(n, ordinals=False): assert 0 <= n <= 999 if n in _SHORT_ORDINAL_CS and ordinals: return _SHORT_ORDINAL_CS[n] if n <= 19: return digits[n] elif n <= 99: q, r = divmod(n, 10) return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r else "") else: q, r = divmod(n, 100) return digits[q] + " sto" + ( " a " + _sub_thousand(r, ordinals) if r else "") def _short_scale(n): if n >= max(_SHORT_SCALE_CS.keys()): return "nekonečno" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000)): if not z: continue number = _sub_thousand(z, not i and ordi) if i: if i >= len(hundreds): return "" number += " " if ordi: if i * 1000 in _SHORT_ORDINAL_CS: if z == 1: number = _SHORT_ORDINAL_CS[i * 1000] else: number += _SHORT_ORDINAL_CS[i * 1000] else: if n not in _SHORT_SCALE_CS: num = int("1" + "0" * (len(str(n)) - 2)) number += _SHORT_SCALE_CS[num] + "tý" else: number = _SHORT_SCALE_CS[n] + "tý" else: number += hundreds[i] res.append(number) ordi = False return ", ".join(reversed(res)) def _split_by(n, split=1000): assert 0 <= n res = [] while n: n, r = divmod(n, split) res.append(r) return res def _long_scale(n): if n >= max(_LONG_SCALE_CS.keys()): return "nekonečno" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 
1000000)): if not z: continue number = pronounce_number_cs(z, places, True, scientific, ordinals=ordi and not i) # strip off the comma after the thousand if i: if i >= len(hundreds): return "" # plus one as we skip 'thousand' # (and 'hundred', but this is excluded by index value) number = number.replace(',', '') if ordi: if i * 1000000 in _LONG_ORDINAL_CS: if z == 1: number = _LONG_ORDINAL_CS[ (i + 1) * 1000000] else: number += _LONG_ORDINAL_CS[ (i + 1) * 1000000] else: if n not in _LONG_SCALE_CS: num = int("1" + "0" * (len(str(n)) - 2)) number += " " + _LONG_SCALE_CS[ num] + "tý" else: number = " " + _LONG_SCALE_CS[n] + "tý" else: number += " " + hundreds[i + 1] res.append(number) return ", ".join(reversed(res)) if short_scale: result += _short_scale(num) else: result += _long_scale(num) # deal with scientific notation unpronounceable as number if not result and "e" in str(num): return pronounce_number_cs(num, places, short_scale, scientific=True) # Deal with fractional part elif not num == int(num) and places > 0: if abs(num) < 1.0 and (result == "mínus " or not result): result += "nula" result += " tečka" _num_str = str(num) _num_str = _num_str.split(".")[1][0:places] for char in _num_str: result += " " + number_names[int(char)] return result def nice_time_cs(dt, speech=True, use_24hour=True, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string # Generate a speakable version of the time if use_24hour: speak = "" # Either "0 8 hundred" or "13 hundred" if string[0] == '0': speak += pronounce_number_cs(int(string[0])) + " " speak += pronounce_number_cs(int(string[1])) else: speak = pronounce_number_cs(int(string[0:2])) speak += " " if string[3:5] == '00': speak += "sto" else: if string[3] == '0': speak += pronounce_number_cs(0) + " " speak += pronounce_number_cs(int(string[4])) else: speak += pronounce_number_cs(int(string[3:5])) return speak else: if dt.hour == 0 and dt.minute == 0: return "půlnoc" elif dt.hour == 12 and dt.minute == 0: return "poledne" hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 if dt.minute == 15: speak = "čtvrt po " + pronounce_number_cs(hour) elif dt.minute == 30: speak = "půl po " + pronounce_number_cs(hour) elif dt.minute == 45: next_hour = (dt.hour + 1) % 12 or 12 speak = "třičtvrtě na " + pronounce_number_cs(next_hour) else: speak = pronounce_number_cs(hour) if dt.minute == 0: if not use_ampm: return speak + " hodin" else: if dt.minute < 10: speak += " oh" speak += " " + pronounce_number_cs(dt.minute) if use_ampm: if dt.hour > 11: speak += " p.m." else: speak += " a.m." return speak lingua-franca-release-v0.4.3/lingua_franca/lang/format_da.py000066400000000000000000000266371426211343400240670ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_da import _EXTRA_SPACE_DA, \ _FRACTION_STRING_DA, _MONTHS_DA, _NUM_POWERS_OF_TEN, _NUM_STRING_DA from math import floor def nice_number_da(number, speech=True, denominators=range(1, 21)): """ Danish helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 einhalb" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)).replace(".", ",") whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. 
"1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_DA[den] if whole == 0: if num == 1: return_string = '{} {}'.format(num, den_str) else: return_string = '{} {}e'.format(num, den_str) else: if num == 1: return_string = '{} og {} {}'.format(whole, num, den_str) else: return_string = '{} og {} {}e'.format(whole, num, den_str) return return_string def pronounce_number_da(number, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: number(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ # TODO short_scale, scientific and ordinals # currently ignored def pronounce_triplet_da(num): result = "" num = floor(num) if num > 99: hundreds = floor(num / 100) if hundreds > 0: if hundreds == 1: result += 'et' + 'hundrede' + _EXTRA_SPACE_DA else: result += _NUM_STRING_DA[hundreds] + \ 'hundrede' + _EXTRA_SPACE_DA num -= hundreds * 100 if num == 0: result += '' # do nothing elif num == 1: result += 'et' elif num <= 20: result += _NUM_STRING_DA[num] + _EXTRA_SPACE_DA elif num > 20: ones = num % 10 tens = num - ones if ones > 0: result += _NUM_STRING_DA[ones] + _EXTRA_SPACE_DA if tens > 0: result += 'og' + _EXTRA_SPACE_DA if tens > 0: result += _NUM_STRING_DA[tens] + _EXTRA_SPACE_DA return result def pronounce_fractional_da(num, places): # fixed number of places even with trailing zeros result = "" place = 10 while places > 0: # doesn't work with 1.0001 and places = 2: int( # number*place) % 10 > 0 and places > 0: result += " " + _NUM_STRING_DA[int(num * place) % 10] place *= 
10 places -= 1 return result def pronounce_whole_number_da(num, scale_level=0): if num == 0: return '' num = floor(num) result = '' last_triplet = num % 1000 if last_triplet == 1: if scale_level == 0: if result != '': result += '' + 'et' else: result += "en" elif scale_level == 1: result += 'et' + _EXTRA_SPACE_DA + 'tusinde' + _EXTRA_SPACE_DA else: result += "en " + _NUM_POWERS_OF_TEN[scale_level] + ' ' elif last_triplet > 1: result += pronounce_triplet_da(last_triplet) if scale_level == 1: result += 'tusinde' + _EXTRA_SPACE_DA if scale_level >= 2: result += "og" + _NUM_POWERS_OF_TEN[scale_level] if scale_level >= 2: if scale_level % 2 == 0: result += "er" # MillionER result += "er " # MilliardER, MillioneER num = floor(num / 1000) scale_level += 1 return pronounce_whole_number_da(num, scale_level) + result + _EXTRA_SPACE_DA result = "" if abs(number) >= 1000000000000000000000000: # cannot do more than this return str(number) elif number == 0: return str(_NUM_STRING_DA[0]) elif number < 0: return "minus " + pronounce_number_da(abs(number), places) else: if number == int(number): return pronounce_whole_number_da(number) else: whole_number_part = floor(number) fractional_part = number - whole_number_part result += pronounce_whole_number_da(whole_number_part) if places > 0: result += " komma" result += pronounce_fractional_da(fractional_part, places) return result def pronounce_ordinal_da(number): """ This function pronounces a number as an ordinal 1 -> first 2 -> second Args: number (int): the number to format Returns: (str): The pronounced number string. 
""" # ordinals for 1, 3, 7 and 8 are irregular # this produces the base form, it will have to be adapted for genus, # casus, numerus ordinals = ["nulte", "første", "anden", "tredie", "fjerde", "femte", "sjette", "syvende", "ottende", "niende", "tiende"] # only for whole positive numbers including zero if number < 0 or number != int(number): return number if number < 10: return ordinals[number] if number < 30: if pronounce_number_da(number)[-1:] == 'e': return pronounce_number_da(number) + "nde" else: return pronounce_number_da(number) + "ende" if number < 40: return pronounce_number_da(number) + "fte" else: if pronounce_number_da(number)[-1:] == 'e': return pronounce_number_da(number) + "nde" else: return pronounce_number_da(number) + "ende" def nice_time_da(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if not speech: return string # Generate a speakable version of the time speak = "" if use_24hour: if dt.hour == 1: speak += "et" # 01:00 is "et" not "en" else: speak += pronounce_number_da(dt.hour) if not dt.minute == 0: if dt.minute < 10: speak += ' nul' speak += " " + pronounce_number_da(dt.minute) return speak # ampm is ignored when use_24hour is true else: if dt.hour == 0 and dt.minute == 0: return "midnat" if dt.hour == 12 and dt.minute == 0: return "middag" # TODO: "half past 3", "a quarter of 4" and other idiomatic times if dt.hour == 0: speak += pronounce_number_da(12) elif dt.hour <= 13: if dt.hour == 1 or dt.hour == 13: # 01:00 and 13:00 is "et" speak += 'et' else: speak += pronounce_number_da(dt.hour) else: speak += pronounce_number_da(dt.hour - 12) if not dt.minute == 0: if dt.minute < 10: speak += ' nul' speak += " " + pronounce_number_da(dt.minute) if use_ampm: if dt.hour > 11: if dt.hour < 18: # 12:01 - 17:59 nachmittags/afternoon speak += " om eftermiddagen" elif dt.hour < 22: # 18:00 - 21:59 abends/evening speak += " om aftenen" else: # 22:00 - 23:59 nachts/at night speak += " om natten" elif dt.hour < 3: # 00:01 - 02:59 nachts/at night speak += " om natten" else: # 03:00 - 11:59 morgens/in the morning speak += " om morgenen" return speak def nice_response_da(text): # check for months and call _nice_ordinal_da declension of ordinals # replace "^" with "hoch" (to the power of) words = text.split() for idx, word in enumerate(words): if word.lower() in _MONTHS_DA: text = _nice_ordinal_da(text) if word == '^': wordNext = words[idx + 1] if idx + 1 < len(words) else "" if wordNext.isnumeric(): words[idx] = "opløftet i" text = " ".join(words) return text def _nice_ordinal_da(text, speech=True): # check for months for declension of ordinals before months # depending on articles/prepositions normalized_text = text words = text.split() for idx, word in enumerate(words): wordNext = words[idx + 1] if idx + 1 < 
len(words) else "" wordPrev = words[idx - 1] if idx > 0 else "" if word[-1:] == ".": if word[:-1].isdecimal(): if wordNext.lower() in _MONTHS_DA: word = pronounce_ordinal_da(int(word[:-1])) if wordPrev.lower() in ["om", "den", "fra", "til", "(fra", "(om", "til"]: word += "n" elif wordPrev.lower() not in ["den"]: word += "r" words[idx] = word normalized_text = " ".join(words) return normalized_text lingua-franca-release-v0.4.3/lingua_franca/lang/format_de.py000066400000000000000000000271001426211343400240550ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_de import _EXTRA_SPACE_DE, \ _FRACTION_STRING_DE, _MONTHS_DE, _NUM_POWERS_OF_TEN_DE, _NUM_STRING_DE from math import floor def nice_number_de(number, speech=True, denominators=range(1, 21)): """ German helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 einhalb" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. 
""" result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)).replace(".", ",") whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. "1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_DE[den] if whole == 0: if num == 1: return_string = 'ein {}'.format(den_str) else: return_string = '{} {}'.format(num, den_str) elif num == 1: return_string = '{} und ein {}'.format(whole, den_str) else: return_string = '{} und {} {}'.format(whole, num, den_str) return return_string def pronounce_number_de(number, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: number(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ # TODO short_scale, scientific and ordinals # currently ignored def pronounce_triplet_de(num): result = "" num = floor(num) if num > 99: hundreds = floor(num / 100) if hundreds > 0: result += _NUM_STRING_DE[ hundreds] + _EXTRA_SPACE_DE + 'hundert' + _EXTRA_SPACE_DE num -= hundreds * 100 if num == 0: result += '' # do nothing elif num == 1: result += 'eins' # need the s for the last digit elif num <= 20: result += _NUM_STRING_DE[num] # + _EXTRA_SPACE_DA elif num > 20: ones = num % 10 tens = num - ones if ones > 0: result += _NUM_STRING_DE[ones] + _EXTRA_SPACE_DE if tens > 0: result += 'und' + _EXTRA_SPACE_DE if tens > 0: result += _NUM_STRING_DE[tens] + _EXTRA_SPACE_DE return result def pronounce_fractional_de(num, places): # fixed number of 
places even with # trailing zeros result = "" place = 10 while places > 0: # doesn't work with 1.0001 and places = 2: int( # number*place) % 10 > 0 and places > 0: result += " " + _NUM_STRING_DE[int(num * place) % 10] if int(num * place) % 10 == 1: result += 's' # "1" is pronounced "eins" after the decimal # point place *= 10 places -= 1 return result def pronounce_whole_number_de(num, scale_level=0): if num == 0: return '' num = floor(num) result = '' last_triplet = num % 1000 if last_triplet == 1: if scale_level == 0: if result != '': result += '' + 'eins' else: result += "eins" elif scale_level == 1: result += 'ein' + _EXTRA_SPACE_DE + 'tausend' + _EXTRA_SPACE_DE else: result += "eine " + _NUM_POWERS_OF_TEN_DE[scale_level] + ' ' elif last_triplet > 1: result += pronounce_triplet_de(last_triplet) if scale_level == 1: # result += _EXTRA_SPACE_DA result += 'tausend' + _EXTRA_SPACE_DE if scale_level >= 2: # if _EXTRA_SPACE_DA == '': # result += " " result += " " + _NUM_POWERS_OF_TEN_DE[scale_level] if scale_level >= 2: if scale_level % 2 == 0: result += "e" # MillionE result += "n " # MilliardeN, MillioneN num = floor(num / 1000) scale_level += 1 return pronounce_whole_number_de(num, scale_level) + result # + _EXTRA_SPACE_DA result = "" if abs(number) >= 1000000000000000000000000: # cannot do more than this return str(number) elif number == 0: return str(_NUM_STRING_DE[0]) elif number < 0: return "minus " + pronounce_number_de(abs(number), places) else: if number == int(number): return pronounce_whole_number_de(number) else: whole_number_part = floor(number) fractional_part = number - whole_number_part result += pronounce_whole_number_de(whole_number_part) if places > 0: result += " Komma" result += pronounce_fractional_de(fractional_part, places) return result def pronounce_ordinal_de(number): """ This function pronounces a number as an ordinal 1 -> first 2 -> second Args: number (int): the number to format Returns: (str): The pronounced number string. 
""" # ordinals for 1, 3, 7 and 8 are irregular # this produces the base form, it will have to be adapted for genus, # casus, numerus ordinals = ["nullte", "erste", "zweite", "dritte", "vierte", "fünfte", "sechste", "siebte", "achte"] # only for whole positive numbers including zero if number < 0 or number != int(number): return number elif number < 9: return ordinals[number] elif number < 20: return pronounce_number_de(number) + "te" else: return pronounce_number_de(number) + "ste" def nice_time_de(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if not speech: if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros return string # Generate a speakable version of the time speak = "" if use_24hour: if dt.hour == 1: speak += "ein" # 01:00 is "ein Uhr" not "eins Uhr" else: speak += pronounce_number_de(dt.hour) speak += " Uhr" if not dt.minute == 0: # zero minutes are not pronounced, 13:00 is # "13 Uhr" not "13 hundred hours" speak += " " + pronounce_number_de(dt.minute) return speak # ampm is ignored when use_24hour is true else: if dt.hour == 0 and dt.minute == 0: return "Mitternacht" elif dt.hour == 12 and dt.minute == 0: return "Mittag" elif dt.minute == 15: # sentence relative to next hour and 0 spoken as 12 next_hour = (dt.hour + 1) % 12 or 12 speak = "viertel " + pronounce_number_de(next_hour) elif dt.minute == 30: next_hour = (dt.hour + 1) % 12 or 12 speak = "halb " + pronounce_number_de(next_hour) elif dt.minute == 45: next_hour = (dt.hour + 1) % 12 or 12 speak = "dreiviertel " + pronounce_number_de(next_hour) else: hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 if hour == 1: # 01:00 and 13:00 is "ein Uhr" not "eins Uhr" speak += 'ein' else: speak += pronounce_number_de(hour) speak += " Uhr" if not dt.minute == 0: speak += " " + pronounce_number_de(dt.minute) if use_ampm: if 3 <= dt.hour < 12: speak += " morgens" # 03:00 - 11:59 morgens/in the morning elif 12 <= dt.hour < 18: speak += " nachmittags" # 12:01 - 17:59 nachmittags/afternoon elif 18 <= dt.hour < 22: speak += " abends" # 18:00 - 21:59 abends/evening else: speak += " nachts" # 22:00 - 02:59 nachts/at night return speak def nice_response_de(text): # check for months and call _nice_ordinal_de declension of ordinals # replace "^" with "hoch" (to the power of) words = text.split() for idx, word in enumerate(words): if word.lower() in _MONTHS_DE: text = _nice_ordinal_de(text) if word == '^': wordNext = words[idx + 1] if idx + 1 < len(words) else "" if wordNext.isnumeric(): words[idx] = "hoch" 
text = " ".join(words) return text def _nice_ordinal_de(text, speech=True): # check for months for declension of ordinals before months # depending on articles/prepositions normalized_text = text words = text.split() for idx, word in enumerate(words): wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordPrev = words[idx - 1] if idx > 0 else "" if word[-1:] == ".": if word[:-1].isdecimal(): if wordNext.lower() in _MONTHS_DE: word = pronounce_ordinal_de(int(word[:-1])) if wordPrev.lower() in ["am", "dem", "vom", "zum", "(vom", "(am", "zum"]: word += "n" elif wordPrev.lower() not in ["der", "die", "das"]: word += "r" words[idx] = word normalized_text = " ".join(words) return normalized_text lingua-franca-release-v0.4.3/lingua_franca/lang/format_en.py000066400000000000000000000335021426211343400240720ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_en import _NUM_STRING_EN, \ _FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN, _LONG_ORDINAL_EN def nice_number_en(number, speech=True, denominators=range(1, 21)): """ English helper for nice_number This function formats a float to human understandable functions. 
Like 4.5 becomes "4 and a half" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. "1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_EN[den] if whole == 0: if num == 1: return_string = 'a {}'.format(den_str) else: return_string = '{} {}'.format(num, den_str) elif num == 1: return_string = '{} and a {}'.format(whole, den_str) else: return_string = '{} and {} {}'.format(whole, num, den_str) if num > 1: return_string += 's' return return_string def pronounce_number_en(number, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: num(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ num = number # deal with infinity if num == float("inf"): return "infinity" elif num == float("-inf"): return "negative infinity" if scientific: number = '%E' % num n, power = number.replace("+", "").split("E") power = int(power) if power != 0: if ordinals: # This handles negatives of powers separately from the normal # handling since each call disables the scientific flag return '{}{} times ten to the {}{} power'.format( 'negative ' if float(n) < 0 else '', 
pronounce_number_en( abs(float(n)), places, short_scale, False, ordinals=False), 'negative ' if power < 0 else '', pronounce_number_en(abs(power), places, short_scale, False, ordinals=True)) else: # This handles negatives of powers separately from the normal # handling since each call disables the scientific flag return '{}{} times ten to the power of {}{}'.format( 'negative ' if float(n) < 0 else '', pronounce_number_en( abs(float(n)), places, short_scale, False), 'negative ' if power < 0 else '', pronounce_number_en(abs(power), places, short_scale, False)) if short_scale: number_names = _NUM_STRING_EN.copy() number_names.update(_SHORT_SCALE_EN) else: number_names = _NUM_STRING_EN.copy() number_names.update(_LONG_SCALE_EN) digits = [number_names[n] for n in range(0, 20)] tens = [number_names[n] for n in range(10, 100, 10)] if short_scale: hundreds = [_SHORT_SCALE_EN[n] for n in _SHORT_SCALE_EN.keys()] else: hundreds = [_LONG_SCALE_EN[n] for n in _LONG_SCALE_EN.keys()] # deal with negatives result = "" if num < 0: result = "negative " if scientific else "minus " num = abs(num) if not ordinals: try: # deal with 4 digits # usually if it's a 4 digit num it should be said like a date # i.e. 1972 => nineteen seventy two if len(str(num)) == 4 and isinstance(num, int): _num = str(num) # deal with 1000, 2000, 2001, 2100, 3123, etc # is skipped as the rest of the # functin deals with this already if _num[1:4] == '000' or _num[1:3] == '00' or int(_num[0:2]) >= 20: pass # deal with 1900, 1300, etc # i.e. 1900 => nineteen hundred elif _num[2:4] == '00': first = number_names[int(_num[0:2])] last = number_names[100] return first + " " + last # deal with 1960, 1961, etc # i.e. 
1960 => nineteen sixty # 1961 => nineteen sixty one else: first = number_names[int(_num[0:2])] if _num[3:4] == '0': last = number_names[int(_num[2:4])] else: second = number_names[int(_num[2:3])*10] last = second + " " + number_names[int(_num[3:4])] return first + " " + last # exception used to catch any unforseen edge cases # will default back to normal subroutine except Exception as e: # TODO this probably shouldn't go to stdout print('ERROR: Exception in pronounce_number_en: {}' + repr(e)) # check for a direct match if num in number_names and not ordinals: if num > 90: result += "one " result += number_names[num] else: def _sub_thousand(n, ordinals=False): assert 0 <= n <= 999 if n in _SHORT_ORDINAL_EN and ordinals: return _SHORT_ORDINAL_EN[n] if n <= 19: return digits[n] elif n <= 99: q, r = divmod(n, 10) return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r else "") else: q, r = divmod(n, 100) return digits[q] + " hundred" + ( " and " + _sub_thousand(r, ordinals) if r else "") def _short_scale(n): if n >= max(_SHORT_SCALE_EN.keys()): return "infinity" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000)): if not z: continue number = _sub_thousand(z, not i and ordi) if i: if i >= len(hundreds): return "" number += " " if ordi: if i * 1000 in _SHORT_ORDINAL_EN: if z == 1: number = _SHORT_ORDINAL_EN[i * 1000] else: number += _SHORT_ORDINAL_EN[i * 1000] else: if n not in _SHORT_SCALE_EN: num = int("1" + "0" * (len(str(n)) - 2)) number += _SHORT_SCALE_EN[num] + "th" else: number = _SHORT_SCALE_EN[n] + "th" else: number += hundreds[i] res.append(number) ordi = False return ", ".join(reversed(res)) def _split_by(n, split=1000): assert 0 <= n res = [] while n: n, r = divmod(n, split) res.append(r) return res def _long_scale(n): if n >= max(_LONG_SCALE_EN.keys()): return "infinity" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 
def nice_time_en(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Render a time of day as English text, for display or for speech.

    For example, 'five thirty' for speech or '5:30' for text display.

    Args:
        dt (datetime): time to format (assumed to already be local time)
        speech (bool): spoken wording (True, default) or clock digits (False)
        use_24hour (bool): 24-hour/military format instead of 12-hour
        use_ampm (bool): add the am/pm marker in 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        clock = dt.strftime("%H:%M")
    else:
        # e.g. "3:01 AM" / "2:22 PM" with the marker, "3:01" / "2:22" without
        clock = dt.strftime("%I:%M %p" if use_ampm else "%I:%M")
        if clock[0] == '0':
            clock = clock[1:]  # drop the leading zero of the hour

    if not speech:
        return clock

    if use_24hour:
        hh = clock[0:2]
        mm = clock[3:5]

        # Hours: "zero eight" for 08, "thirteen" for 13
        if hh[0] == '0':
            words = [pronounce_number_en(int(hh[0])),
                     pronounce_number_en(int(hh[1]))]
        else:
            words = [pronounce_number_en(int(hh))]

        # Minutes: "hundred" on the hour, "zero five" for 05, else the number
        if mm == '00':
            words.append("hundred")
        elif mm[0] == '0':
            words.append(pronounce_number_en(0))
            words.append(pronounce_number_en(int(mm[1])))
        else:
            words.append(pronounce_number_en(int(mm)))
        return " ".join(words)

    minute = dt.minute
    if minute == 0 and dt.hour == 0:
        return "midnight"
    if minute == 0 and dt.hour == 12:
        return "noon"

    # 12-hour clock: hour 0 is spoken as twelve
    hour12 = dt.hour % 12 or 12
    if minute == 45:
        upcoming = (dt.hour + 1) % 12 or 12
        spoken = "quarter to " + pronounce_number_en(upcoming)
    elif minute == 30:
        spoken = "half past " + pronounce_number_en(hour12)
    elif minute == 15:
        spoken = "quarter past " + pronounce_number_en(hour12)
    elif minute == 0:
        spoken = pronounce_number_en(hour12)
        if not use_ampm:
            return spoken + " o'clock"
    else:
        spoken = pronounce_number_en(hour12)
        if minute < 10:
            spoken += " oh"
        spoken += " " + pronounce_number_en(minute)

    if use_ampm:
        spoken += " p.m." if dt.hour > 11 else " a.m."

    return spoken
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Format functions for Castilian Spanish (es-es)
"""

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_es import _NUM_STRING_ES, \
    _FRACTION_STRING_ES


def nice_number_es(number, speech=True, denominators=range(1, 21)):
    """ Spanish helper for nice_number

    Formats a float in a human-friendly way: 4.5 becomes "4 y medio" for
    speech and "4 1/2" for text.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    strNumber = ""
    whole = 0
    num = 0
    den = 0

    result = convert_to_mixed_fraction(number, denominators)

    if not result:
        # Give up, just represent as a 3 decimal number
        whole = round(number, 3)
    else:
        whole, num, den = result

    if not speech:
        if num == 0:
            # Spanish display convention: space as thousands separator,
            # comma as decimal mark
            strNumber = '{:,}'.format(whole)
            strNumber = strNumber.replace(",", " ")
            strNumber = strNumber.replace(".", ",")
            return strNumber
        else:
            return '{} {}/{}'.format(whole, num, den)
    else:
        if num == 0:
            # not a fraction: only swap the decimal mark
            strNumber = str(whole)
            strNumber = strNumber.replace(".", ",")
            return strNumber
        den_str = _FRACTION_STRING_ES[den]
        if whole == 0:
            # no whole part
            if num == 1:
                # e.g. "un medio"
                strNumber = 'un {}'.format(den_str)
            else:
                # e.g. "3 cuartos"
                strNumber = '{} {}'.format(num, den_str)
        elif num == 1:
            # whole part and a numerator of 1
            if den == 2:
                # e.g. "1 y medio"
                strNumber = '{} y {}'.format(whole, den_str)
            else:
                # e.g. "1 y 1 tercio"
                strNumber = '{} y 1 {}'.format(whole, den_str)
        else:
            # e.g. "2 y 3 cuartos"
            strNumber = '{} y {} {}'.format(whole, num, den_str)
        if num > 1:
            # Spanish fraction nouns all take a plural "s", "tercio"
            # included ("dos tercios"); the old exemption for den == 3 was
            # carried over from French, where "tiers" is invariable.
            strNumber += 's'

    return strNumber


def pronounce_number_es(number, places=2):
    """
    Convert a number to its spoken equivalent

    For example, 5.2 would return 'cinco coma dos'

    Args:
        number (float or int): the number to pronounce; values whose
            magnitude is 100 or more are returned as str(number)
        places (int): maximum decimal places to speak
    Returns:
        (str): The pronounced number
    """
    if abs(number) >= 100:
        # TODO: support numbers of 100 and above
        return str(number)

    result = ""
    if number < 0:
        result = "menos "
    number = abs(number)

    # 21 to 29 are written as a single word.  The upper bound is exclusive
    # of 30 so that fractional values such as 29.5 also take this branch
    # instead of falling through to a direct table lookup of 29.
    if 20 <= number < 30:
        tens = int(number-int(number) % 10)
        ones = int(number - tens)
        result += _NUM_STRING_ES[tens]
        if ones > 0:
            # drop the final "e" of "veinte" to build 21 - 29; 22, 23 and 26
            # are special-cased because they carry a written accent
            result = result[:-1]
            if ones == 2:
                result += "idós"
            elif ones == 3:
                result += "itrés"
            elif ones == 6:
                result += "iséis"
            else:
                result += "i" + _NUM_STRING_ES[ones]
    elif number >= 30:  # 30 and above: "<tens> y <ones>"
        tens = int(number-int(number) % 10)
        ones = int(number - tens)
        result += _NUM_STRING_ES[tens]
        if ones > 0:
            result += " y " + _NUM_STRING_ES[ones]
    else:
        result += _NUM_STRING_ES[int(number)]

    # Decimal part: whether written with a comma or a dot, it is always
    # pronounced "coma", followed by the digits one by one.
    if not number == int(number) and places > 0:
        if abs(number) < 1.0 and (result == "menos " or not result):
            result += "cero"
        result += " coma"
        _num_str = str(number)
        _num_str = _num_str.split(".")[1][0:places]
        for char in _num_str:
            result += " " + _NUM_STRING_ES[int(char)]
    return result


def nice_time_es(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'las cinco y media' for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    speak = ""
    if use_24hour:
        # In 24h format no extra qualifier such as "de la noche",
        # "de la tarde" or "de la mañana" is added.
        # http://lema.rae.es/dpd/srv/search?id=YNoTWNJnAD6bhhVBf9
        if dt.hour == 1:
            speak += "la una"
        else:
            speak += "las " + pronounce_number_es(dt.hour)

        # 14:04 is "las catorce cero cuatro"
        if dt.minute < 10:
            speak += " cero " + pronounce_number_es(dt.minute)
        else:
            speak += " " + pronounce_number_es(dt.minute)

    else:
        # Minutes 35/40/45/50/55 are spoken relative to the next hour
        # ("las tres menos cuarto"), encoded as a negative minute count.
        if dt.minute in (35, 40, 45, 50, 55):
            minute = dt.minute - 60
            hour = dt.hour + 1  # may reach 24; handled by the branches below
        else:
            minute = dt.minute
            hour = dt.hour

        if hour == 0 or hour == 12:
            speak += "las doce"
        elif hour == 1 or hour == 13:
            speak += "la una"
        elif hour < 13:
            speak = "las " + pronounce_number_es(hour)
        else:
            speak = "las " + pronounce_number_es(hour-12)

        if minute != 0:
            # special wordings for the quarters and the half hour
            if minute == 15:
                speak += " y cuarto"
            elif minute == 30:
                speak += " y media"
            elif minute == -15:
                speak += " menos cuarto"
            else:  # "seis y nueve", "siete y veinticinco"
                if minute > 0:
                    speak += " y " + pronounce_number_es(minute)
                else:
                    # "las siete menos veinte": no "y"; the "menos" comes
                    # from pronouncing the negative minute count
                    speak += " " + pronounce_number_es(minute)

        # exact hour without a day-part qualifier: "las tres en punto"
        if minute == 0 and not use_ampm:
            speak += " en punto"

        if use_ampm:
            # "de la noche" runs from nightfall to midnight, so starting it
            # at 21h is somewhat subjective; in Spain 20h is still
            # "de la tarde".  12h counts as morning/midday, so
            # "de la tarde" starts at 13h.
            # http://lema.rae.es/dpd/srv/search?id=YNoTWNJnAD6bhhVBf9
            if hour >= 0 and hour < 6:
                speak += " de la madrugada"
            elif hour >= 6 and hour < 13:
                speak += " de la mañana"
            elif hour >= 13 and hour < 21:
                speak += " de la tarde"
            else:
                speak += " de la noche"
    return speak
#
"""
Format functions for Euskara (eu-eu)
"""

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca import config
from lingua_franca.time import to_local, now_local, to_utc

# Spoken clock hours on a 12-hour dial (keys 1..12)
HOUR_STRING_EU = {
    1: 'ordubata',
    2: 'ordubiak',
    3: 'hirurak',
    4: 'laurak',
    5: 'bostak',
    6: 'seirak',
    7: 'zazpirak',
    8: 'zortzirak',
    9: 'bederatziak',
    10: 'hamarrak',
    11: 'hamaikak',
    12: 'hamabiak'
}

# Cardinal numbers.  Basque counts in twenties: the odd tens are written
# "<even ten>ta hamar", which pronounce_number_eu relies on when it splits
# these entries on the space.
NUM_STRING_EU = {
    0: 'zero',
    1: 'bat',
    2: 'bi',
    3: 'hiru',
    4: 'lau',
    5: 'bost',
    6: 'sei',
    7: 'zazpi',
    8: 'zortzi',
    9: 'bederatzi',
    10: 'hamar',
    11: 'hamaika',
    12: 'hamabi',
    13: 'hamahiru',
    14: 'hamalau',
    15: 'hamabost',
    16: 'hamasei',
    17: 'hamazazpi',
    18: 'hemezortzi',
    19: 'hemeretzi',
    20: 'hogei',
    30: 'hogeita hamar',
    40: 'berrogei',
    50: 'berrogeita hamar',
    60: 'hirurogei',
    # was misspelled 'hirurogehita hamar', which also corrupted 71-79
    70: 'hirurogeita hamar',
    80: 'laurogei',
    90: 'laurogeita hamar',
    100: 'ehun',
    200: 'berrehun',
    300: 'hirurehun',
    400: 'laurehun',
    500: 'bostehun',
    600: 'seirehun',
    700: 'zazpirehun',
    800: 'zortzirehun',
    900: 'bederatzirehun',
    1000: 'mila'
}

# Fraction nouns keyed by denominator
FRACTION_STRING_EU = {
    2: 'erdi',
    3: 'heren',
    4: 'laurden',
    5: 'bosten',
    6: 'seiren',
    7: 'zazpiren',
    8: 'zortziren',
    9: 'bederatziren',
    10: 'hamarren',
    11: 'hamaikaren',
    12: 'hamabiren',
    13: 'hamahiruren',
    14: 'hamalauren',
    15: 'hamabosten',
    16: 'hamaseiren',
    17: 'hamazazpiren',
    18: 'hemezortziren',
    19: 'hemeretziren',
    20: 'hogeiren'
}


def nice_number_eu(number, speech=True, denominators=range(1, 21)):
    """ Euskara helper for nice_number

    Formats a float in a human-friendly way: 4.5 becomes "4 eta erdi" for
    speech and "4 1/2" for text.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    strNumber = ""
    whole = 0
    num = 0
    den = 0

    result = convert_to_mixed_fraction(number, denominators)

    if not result:
        # Give up, just represent as a 3 decimal number
        whole = round(number, 3)
    else:
        whole, num, den = result

    if not speech:
        if num == 0:
            # space as thousands separator, comma as decimal mark
            strNumber = '{:,}'.format(whole)
            strNumber = strNumber.replace(",", " ")
            strNumber = strNumber.replace(".", ",")
            return strNumber
        else:
            return '{} {}/{}'.format(whole, num, den)
    else:
        if num == 0:
            # not a fraction: only swap the decimal mark
            strNumber = str(whole)
            strNumber = strNumber.replace(".", ",")
            return strNumber
        den_str = FRACTION_STRING_EU[den]
        if whole == 0:
            # no whole part
            if num == 1:
                # e.g. "erdi bat"
                strNumber = '{} bat'.format(den_str)
            else:
                # e.g. "3 laurden"
                strNumber = '{} {}'.format(num, den_str)
        elif num == 1:
            # whole part and a numerator of 1
            if den == 2:
                # e.g. "1 eta erdi"
                strNumber = '{} eta {}'.format(whole, den_str)
            else:
                # e.g. "1 eta heren bat"
                strNumber = '{} eta {} bat'.format(whole, den_str)
        else:
            # e.g. "2 eta 3 laurden"
            strNumber = '{} eta {} {}'.format(whole, num, den_str)

    return strNumber


def pronounce_number_eu(num, places=2):
    """
    Convert a number to its spoken equivalent

    For example, 5.2 would return 'bost koma bi'

    Args:
        num (float or int): the number to pronounce; values whose magnitude
            is 10000 or more are returned as str(num)
        places (int): maximum decimal places to speak
    Returns:
        (str): The pronounced number
    """
    if abs(num) >= 10000:
        # TODO: support numbers of 10000 and above
        return str(num)

    result = ""
    if num < 0:
        result = "minus "
    num = abs(num)

    # Split into thousands / hundreds / tens / ones; the fractional part
    # stays in _num and is truncated away by int().
    thousands = int(num-int(num) % 1000)
    _num = num - thousands
    hundreds = int(_num-int(_num) % 100)
    _num = _num - hundreds
    tens = int(_num-_num % 10)
    ones = int(_num - tens)

    if thousands > 0:
        if thousands > 1000:
            result += NUM_STRING_EU[int(thousands/1000)] + ' '
        result += NUM_STRING_EU[1000]
        # "eta" links the thousands to a bare hundreds group; otherwise
        # the following group is simply appended
        if hundreds > 0 and tens == 0 and ones == 0:
            result += ' eta '
        elif hundreds > 0 or tens > 0 or ones > 0:
            result += ' '
    if hundreds > 0:
        result += NUM_STRING_EU[hundreds]
        if tens > 0 or ones > 0:
            result += ' eta '
    if tens or ones:
        if tens == 0 or tens == 10 or ones == 0:
            # 1..19 and the round tens are direct table entries
            result += NUM_STRING_EU[int(_num)]
        else:
            # Vigesimal composition: 35 is "hogeita hamabost" (20 + 15).
            # For odd tens, fold the extra ten into the ones, then take
            # the first word of the tens entry without its "ta" suffix.
            if (tens % 20) == 10:
                ones = ones + 10
            base = NUM_STRING_EU[int(tens)].split(' ')[0].replace("ta", "")
            result += base + "ta " + NUM_STRING_EU[int(ones)]
    if abs(num) < 1.0:
        # covers both 0 and the integer part of 0.x
        result += NUM_STRING_EU[0]
    # Decimal part: whether written with a comma or a dot, it is
    # pronounced "koma", followed by the digits one by one.  The integer
    # part (including "zero") has already been emitted above.
    if not num == int(num) and places > 0:
        result += " koma"
        _num_str = str(num)
        _num_str = _num_str.split(".")[1][0:places]
        for char in _num_str:
            result += " " + NUM_STRING_EU[int(char)]
    return result


def nice_time_eu(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'bostak eta erdi' for speech or '5:30' for text
    display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the day-part word for 12-hour format
    Returns:
        (str): The formatted time string
    """
    # NOTE(review): converting to UTC here looks at odds with the
    # "already in local timezone" contract above -- confirm intent.
    if config.inject_timezones:
        dt = to_utc(dt)
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    speak = ""
    if use_24hour:
        # 24h format: no day-part qualifier is added
        speak += pronounce_number_eu(dt.hour) + 'ak'

        # single-digit minutes are prefixed with "zero"
        if dt.minute < 10:
            speak += " zero " + pronounce_number_eu(dt.minute)
        else:
            speak += " " + pronounce_number_eu(dt.minute)

    else:
        minute = dt.minute
        hour = dt.hour

        # map onto the 12-hour dial (0 -> 12, 13..23 -> 1..11)
        _hour = hour
        if _hour == 0:
            _hour = 12
        if _hour > 12:
            _hour -= 12

        if (minute > 30):
            # past the half hour the time is spoken relative to the next
            # hour; wrap 12 -> 1 so the HOUR_STRING_EU lookup cannot miss
            # (12:45 used to raise KeyError on key 13)
            _hour = _hour % 12 + 1

        speak = HOUR_STRING_EU[_hour]

        if minute != 0:
            if minute <= 30:
                if minute == 15:
                    speak += " eta laurden"
                elif minute == 30:
                    speak += " eta erdi"
                else:
                    speak += " eta " + pronounce_number_eu(minute)
            else:
                # "<next hour> <minutes left> gutxi"
                if minute == 45:
                    speak += " laurden gutxi"
                else:
                    speak += " " + pronounce_number_eu(60 - minute) + " gutxi"

        # exact hour without a day-part qualifier
        if minute == 0 and not use_ampm:
            speak += " puntuan"

        if use_ampm:
            # day-part prefix chosen from the real (24h) hour
            if hour >= 6 and hour < 13:
                speak = "goizeko " + speak
            elif hour >= 13 and hour < 20:
                speak = "arratsaldeko " + speak
            else:
                speak = "gaueko " + speak
    return speak


def nice_relative_time_eu(when, relative_to=None, lang=None):
    """Create a relative phrase to roughly describe a datetime

    Examples are "25 segundo", "minutu bat", "7 egun".

    Args:
        when (datetime): Local timezone
        relative_to (datetime): Baseline for relative time, default is now()
        lang (str, optional): accepted for signature compatibility; not
            used by this function.
    Returns:
        str: Relative description of the given time
    """
    if relative_to:
        now = relative_to
    else:
        now = now_local()
    delta = to_local(when) - now
    seconds = delta.total_seconds()

    if seconds < 1:
        return "0 segundo"

    if seconds < 90:
        if seconds == 1:
            return "segundo bat"
        else:
            return "{} segundo".format(int(seconds))

    minutes = int((seconds + 30) // 60)  # +30 to round minutes
    if minutes < 90:
        if minutes == 1:
            return "minutu bat"
        else:
            return "{} minutu".format(minutes)

    hours = int((minutes + 30) // 60)  # +30 to round hours
    if hours < 36:
        if hours == 1:
            return "ordu bat"
        else:
            return "{} ordu".format(hours)

    # TODO: "2 weeks", "3 months", "4 years", etc
    days = int((hours + 12) // 24)  # +12 to round days
    if days == 1:
        return "egun bat"
    else:
        return "{} egun".format(days)
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_fa import \
    _FARSI_ONES, _FARSI_TENS, _FARSI_HUNDREDS, _FARSI_BIG, _FARSI_SEPERATOR, \
    _FARSI_FRAC, _FARSI_FRAC_BIG, _FRACTION_STRING_FA, _FORMAL_VARIANT
import math
from lingua_franca.internal import lookup_variant
from enum import IntEnum
from functools import wraps


class NumberVariantFA(IntEnum):
    """Register in which Farsi numbers are worded."""
    CONVERSATIONAL = 0
    FORMAL = 1


# presumably maps a variant name to the enum member; see lookup_variant
lookup_number = lookup_variant({
    "default": NumberVariantFA.CONVERSATIONAL,
    "conversational": NumberVariantFA.CONVERSATIONAL,
    "formal": NumberVariantFA.FORMAL,
})


def _apply_number_variant(text, variant):
    """Rewrite conversational wording in *text* to the formal one.

    For the FORMAL variant every value of _FORMAL_VARIANT found in the text
    is replaced by its key; any other variant returns the text unchanged.
    """
    if variant == NumberVariantFA.FORMAL:
        for key, value in _FORMAL_VARIANT.items():
            text = text.replace(value, key)
    return text


def _handle_number_variant(func):
    """Decorator: post-process *func*'s result for the requested variant.

    The wrapped function is called unchanged; when the caller passed a
    `variant` keyword, its result is run through _apply_number_variant.
    """
    @wraps(func)
    @lookup_variant({
        "default": NumberVariantFA.CONVERSATIONAL,
        "conversational": NumberVariantFA.CONVERSATIONAL,
        "formal": NumberVariantFA.FORMAL,
    })
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        if 'variant' in kwargs:
            return _apply_number_variant(result, kwargs['variant'])
        else:
            return result
    return wrapper


@_handle_number_variant
def nice_number_fa(number, speech=True, denominators=range(1, 21),
                   variant=None):
    """ Farsi helper for nice_number

    Formats a float in a human-friendly way, as a mixed fraction: words
    for speech and "4 1/2" for text.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
        variant: wording variant, consumed by the decorator
    Returns:
        (str): The formatted string.
    """

    result = convert_to_mixed_fraction(number, denominators)
    if not result:
        # Give up, just represent as a 3 decimal number
        return str(round(number, 3))

    whole, num, den = result

    if not speech:
        if num == 0:
            # TODO: Number grouping?  E.g. "1,000,000"
            return str(whole)
        else:
            return '{} {}/{}'.format(whole, num, den)

    if num == 0:
        return str(whole)
    den_str = _FRACTION_STRING_FA[den]
    if whole == 0:
        if num == 1:
            return_string = 'یک {}'.format(den_str)
        else:
            return_string = '{} {}'.format(num, den_str)
    elif num == 1:
        return_string = '{} و یک {}'.format(whole, den_str)
    else:
        return_string = '{} و {} {}'.format(whole, num, den_str)
    return return_string


def _float2tuple(value, _precision):
    """Split *value* into (integer part, fraction digits, digit count).

    The fraction is scaled to `_precision` digits, trailing zeros are
    stripped and the returned precision is reduced accordingly.
    """
    pre = int(value)

    post = abs(value - pre) * 10**_precision
    if abs(round(post) - post) < 0.01:
        # We generally floor all values beyond our precision (rather than
        # rounding), but in cases where we have something like 1.239999999,
        # which is probably due to python's handling of floats, we actually
        # want to consider it as 1.24 instead of 1.23
        post = int(round(post))
    else:
        post = int(math.floor(post))

    while post != 0:
        x, y = divmod(post, 10)
        if y != 0:
            break
        post = x
        _precision -= 1

    return pre, post, _precision


def _cardinal3(number):
    """Word a non-negative integer below 1000."""
    # Small numbers come straight from the ones table.  The bound follows
    # the table length rather than a literal 19, which sent 19 itself to
    # the tens branch and worded it as "ten and nine".
    # NOTE(review): assumes _FARSI_ONES is indexed by value from 0 and has
    # at most 20 entries -- confirm against common_data_fa.
    if (number < len(_FARSI_ONES)):
        return _FARSI_ONES[number]
    if (number < 100):
        x, y = divmod(number, 10)
        if y == 0:
            return _FARSI_TENS[x]
        return _FARSI_TENS[x] + _FARSI_SEPERATOR + _FARSI_ONES[y]
    x, y = divmod(number, 100)
    if y == 0:
        return _FARSI_HUNDREDS[x]
    return _FARSI_HUNDREDS[x] + _FARSI_SEPERATOR + _cardinal3(y)


def _cardinalPos(number):
    """Word a non-negative integer, three digits at a time."""
    x = number
    res = ''
    # _FARSI_BIG supplies the scale word for each successive 3-digit group,
    # least significant group first
    for b in _FARSI_BIG:
        x, y = divmod(x, 1000)
        if (y == 0):
            continue
        yx = _cardinal3(y)
        if y == 1 and b == 'هزار':
            # one thousand is just the scale word, without "one"
            yx = b
        elif b != '':
            yx += ' ' + b
        if (res == ''):
            res = yx
        else:
            res = yx + _FARSI_SEPERATOR + res
    return res


def _fractional(number, l):
    """Word the fraction number / 10**l; exactly one half has its own word."""
    if (number / 10**l == 0.5):
        return "نیم"
    x = _cardinalPos(number)
    ld3, lm3 = divmod(l, 3)
    ltext = (_FARSI_FRAC[lm3] + " " + _FARSI_FRAC_BIG[ld3]).strip() + 'م'
    return x + " " + ltext


def _to_ordinal(number):
    """Word *number* as an ordinal by suffixing its cardinal form."""
    r = _to_cardinal(number, 0)
    # a cardinal ending in "سه" takes an irregular ordinal ending
    if (r[-1] == 'ه' and r[-2] == 'س'):
        return r[:-1] + 'وم'
    return r + 'م'


def _to_ordinal_num(value):
    """Return the digits of *value* followed by the ordinal suffix."""
    return str(value)+"م"


def _to_cardinal(number, places):
    """Word *number* as a cardinal with at most *places* decimal digits."""
    if number < 0:
        return "منفی " + _to_cardinal(-number, places)
    if (number == 0):
        return "صفر"
    x, y, l = _float2tuple(number, places)
    if y == 0:
        return _cardinalPos(x)
    if x == 0:
        return _fractional(y, l)
    return _cardinalPos(x) + _FARSI_SEPERATOR + _fractional(y, l)


@_handle_number_variant
def pronounce_number_fa(number, places=2, scientific=False,
                        ordinals=False, variant=None):
    """
    Convert a number to its spoken equivalent

    Args:
        number (float or int): the number to pronounce
        places (int): maximum decimal places to speak
        scientific (bool): pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form "first" instead of "one"
        variant: wording variant, consumed by the decorator
    Returns:
        (str): The pronounced number
    """
    num = number
    # deal with infinity
    if num == float("inf"):
        return "بینهایت"
    elif num == float("-inf"):
        return "منفی بینهایت"
    if scientific:
        if num == 0:
            return "صفر"
        # Keep the formatted text in its own names: the original rebound
        # `number` to this string, so a zero exponent fell through to the
        # cardinal/ordinal path with a str and raised TypeError.
        mantissa, power = ('%E' % num).replace("+", "").split("E")
        power = int(power)
        if power != 0:
            return '{}{} ضرب در ده به توان {}{}'.format(
                'منفی ' if float(mantissa) < 0 else '',
                pronounce_number_fa(
                    abs(float(mantissa)), places, False, ordinals=False),
                'منفی ' if power < 0 else '',
                pronounce_number_fa(abs(power), places, False, ordinals=False))
    if ordinals:
        return _to_ordinal(num)
    return _to_cardinal(num, places)


@_handle_number_variant
def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False,
                 variant=None):
    """
    Format a time to a comfortable human format

    Produces spoken wording for speech or clock digits such as '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
        variant: wording variant, consumed by the decorator
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        speak = ""

        # hour: a leading zero is skipped, e.g. "08" is spoken as eight
        if string[0] == '0':
            speak += pronounce_number_fa(int(string[1]))
        else:
            speak = pronounce_number_fa(int(string[0:2]))
        # minutes are appended only when non-zero
        if not string[3:5] == '00':
            speak += " و "
            if string[3] == '0':
                speak += pronounce_number_fa(int(string[4]))
            else:
                speak += pronounce_number_fa(int(string[3:5]))
            speak += ' دقیقه'
        return speak
    else:
        if dt.hour == 0 and dt.minute == 0:
            return "نیمه شب"
        elif dt.hour == 12 and dt.minute == 0:
            return "ظهر"

        hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
        if dt.minute == 15:
            speak = pronounce_number_fa(hour) + " و ربع"
        elif dt.minute == 30:
            speak = pronounce_number_fa(hour) + " و نیم"
        elif dt.minute == 45:
            # a quarter to the next hour
            next_hour = (dt.hour + 1) % 12 or 12
            speak = "یه ربع به " + pronounce_number_fa(next_hour)
        else:
            speak = pronounce_number_fa(hour)

            if dt.minute == 0:
                if not use_ampm:
                    return speak
            else:
                speak += " و " + pronounce_number_fa(dt.minute) + ' دقیقه'

        if use_ampm:
            if dt.hour > 11:
                speak += " بعد از ظهر"
            else:
                speak += " قبل از ظهر"

        return speak
# See the License for the specific language governing permissions and
# limitations under the License.
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_fr import _NUM_STRING_FR, \
    _FRACTION_STRING_FR


def nice_number_fr(number, speech=True, denominators=range(1, 21)):
    """ French helper for nice_number

    Formats a float in a human-friendly way: 4.5 becomes "4 et demi" for
    speech and "4 1/2" for text.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """
    strNumber = ""
    whole = 0
    num = 0
    den = 0

    result = convert_to_mixed_fraction(number, denominators)

    if not result:
        # Give up, just represent as a 3 decimal number
        whole = round(number, 3)
    else:
        whole, num, den = result

    if not speech:
        if num == 0:
            # French display convention: space as thousands separator,
            # comma as decimal mark
            strNumber = '{:,}'.format(whole)
            strNumber = strNumber.replace(",", " ")
            strNumber = strNumber.replace(".", ",")
            return strNumber
        else:
            return '{} {}/{}'.format(whole, num, den)
    else:
        if num == 0:
            # not a fraction: only swap the decimal mark
            strNumber = str(whole)
            strNumber = strNumber.replace(".", ",")
            return strNumber
        den_str = _FRACTION_STRING_FR[den]
        if whole == 0:
            # no whole part
            if num == 1:
                # e.g. "un demi"
                strNumber = 'un {}'.format(den_str)
            else:
                # e.g. "3 quarts"
                strNumber = '{} {}'.format(num, den_str)
        elif num == 1:
            # whole part and a numerator of 1
            if den == 2:
                # e.g. "1 et demi"
                strNumber = '{} et {}'.format(whole, den_str)
            else:
                # e.g. "1 et 1 tiers"
                strNumber = '{} et 1 {}'.format(whole, den_str)
        else:
            # e.g. "2 et 3 quarts"
            strNumber = '{} et {} {}'.format(whole, num, den_str)
        if num > 1 and den != 3:
            # plural "s", except for "tiers", which is invariable
            strNumber += 's'

    return strNumber


def pronounce_number_fr(number, places=2):
    """
    Convert a number to its spoken equivalent

    For example, 5.2 would return 'cinq virgule deux'

    Args:
        number (float or int): the number to pronounce; values whose
            magnitude is 100 or more are returned as str(number)
        places (int): maximum decimal places to speak
    Returns:
        (str): The pronounced number
    """
    if abs(number) >= 100:
        # TODO: Support for numbers of 100 and above
        return str(number)

    result = ""
    if number < 0:
        result = "moins "
    number = abs(number)

    # 0..16 have their own words; composition starts at 17.  The bound is
    # ">= 17" rather than "> 16" so that 16.5 is read as "seize virgule
    # cinq" instead of being composed as "dix-six".
    if number >= 17:
        tens = int(number-int(number) % 10)
        ones = int(number-tens)
        if ones != 0:
            if tens > 10 and tens <= 60 and ones == 1:
                # vingt-et-un, trente-et-un, ...
                result += _NUM_STRING_FR[tens] + "-et-" + _NUM_STRING_FR[ones]
            elif tens == 70 and ones == 1:
                # compare the integer parts so 71.5 also keeps the "et"
                result += "soixante-et-onze"
            elif tens == 70:
                # 72..79 are built on sixty: soixante-douze, soixante-dix-sept
                result += _NUM_STRING_FR[60] + "-"
                if ones < 7:
                    result += _NUM_STRING_FR[10 + ones]
                else:
                    result += _NUM_STRING_FR[10] + "-" + _NUM_STRING_FR[ones]
            elif tens == 90:
                # 91..99 are built on eighty: quatre-vingt-onze, ...
                result += _NUM_STRING_FR[80] + "-"
                if ones < 7:
                    result += _NUM_STRING_FR[10 + ones]
                else:
                    result += _NUM_STRING_FR[10] + "-" + _NUM_STRING_FR[ones]
            else:
                result += _NUM_STRING_FR[tens] + "-" + _NUM_STRING_FR[ones]
        else:
            if number == 80:
                # exactly eighty takes the plural "s"
                result += "quatre-vingts"
            else:
                result += _NUM_STRING_FR[tens]
    else:
        result += _NUM_STRING_FR[int(number)]

    # Deal with decimal part: "virgule" followed by the digits one by one
    if not number == int(number) and places > 0:
        if abs(number) < 1.0 and (result == "moins " or not result):
            result += "zéro"
        result += " virgule"
        _num_str = str(number)
        _num_str = _num_str.split(".")[1][0:places]
        for char in _num_str:
            result += " " + _NUM_STRING_FR[int(char)]
    return result


def nice_time_fr(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'cinq heures trente' for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    speak = ""
    if use_24hour:

        # "treize heures trente"
        if dt.hour == 0:
            speak += "minuit"
        elif dt.hour == 12:
            speak += "midi"
        elif dt.hour == 1:
            speak += "une heure"
        else:
            speak += pronounce_number_fr(dt.hour) + " heures"

        if dt.minute != 0:
            speak += " " + pronounce_number_fr(dt.minute)

    else:
        # Minutes 35/40/45/50/55 are spoken relative to the next hour
        # ("trois heures moins le quart"), encoded as a negative minute
        # count.
        if dt.minute in (35, 40, 45, 50, 55):
            minute = dt.minute - 60
            # wrap 23 -> 0 so that 23:45 is "minuit moins le quart" rather
            # than "douze heures moins le quart"
            hour = (dt.hour + 1) % 24
        else:
            minute = dt.minute
            hour = dt.hour

        if hour == 0:
            speak += "minuit"
        elif hour == 12:
            speak += "midi"
        elif hour == 1 or hour == 13:
            speak += "une heure"
        elif hour < 13:
            speak = pronounce_number_fr(hour) + " heures"
        else:
            speak = pronounce_number_fr(hour-12) + " heures"

        if minute != 0:
            if minute == 15:
                speak += " et quart"
            elif minute == 30:
                speak += " et demi"
            elif minute == -15:
                speak += " moins le quart"
            else:
                # a negative count is pronounced "moins ..."
                speak += " " + pronounce_number_fr(minute)

        if use_ampm:
            # no qualifier for "midi" and "minuit"
            if hour > 17:
                speak += " du soir"
            elif hour > 12:
                speak += " de l'après-midi"
            elif hour > 0 and hour < 12:
                speak += " du matin"

    return speak
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_hu import _NUM_POWERS_OF_TEN, \
    _EXTRA_SPACE_HU, _FRACTION_STRING_HU, _MONTHS_HU, _NUM_STRING_HU
from math import floor


def _get_vocal_type_hu(word):
    """Classify the vowels of *word*.

    Returns:
        int: 0 if the word has no high vowels (low type), 1 if it has high
        vowels only, 2 if it mixes high and low vowels.
    """
    vowels_high = len([char for char in word if char in 'eéiíöőüű'])
    vowels_low = len([char for char in word if char in 'aáoóuú'])
    if vowels_high != 0 and vowels_low != 0:
        return 2  # 2: type is mixed
    return 0 if vowels_high == 0 else 1  # 0: type is low, 1: is high


def nice_number_hu(number, speech=True, denominators=range(1, 21)):
    """ Hungarian helper for nice_number

    Formats a float in a human-friendly way: 4.5 becomes "4 és fél" for
    speech and "4 1/2" for text.

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """

    result = convert_to_mixed_fraction(number, denominators)
    if not result:
        # Give up, just represent as a 3 decimal number
        return str(round(number, 3)).replace(".", ",")

    whole, num, den = result

    if not speech:
        if num == 0:
            # TODO: Number grouping?  E.g. "1,000,000"
            return str(whole)
        else:
            return '{} {}/{}'.format(whole, num, den)

    if num == 0:
        return str(whole)
    den_str = _FRACTION_STRING_HU[den]
    if whole == 0:
        if num == 1:
            # a lone half takes no "egy" in front
            one = 'egy ' if den != 2 else ''
            return_string = '{}{}'.format(one, den_str)
        else:
            return_string = '{} {}'.format(num, den_str)
    elif num == 1:
        # "N és <half>" for halves, "N egész egy <fraction>" otherwise
        pointOne = 'egész egy' if den != 2 else 'és'
        return_string = '{} {} {}'.format(whole, pointOne, den_str)
    else:
        return_string = '{} egész {} {}'.format(whole, num, den_str)
    return return_string


def pronounce_number_hu(number, places=2, short_scale=True, scientific=False,
                        ordinals=False):
    """
    Convert a number to its spoken equivalent

    Args:
        number (float or int): the number to pronounce; values whose
            magnitude is 10**24 or more are returned as str(number)
        places (int): maximum decimal places to speak
        short_scale (bool): currently ignored
        scientific (bool): currently ignored
        ordinals (bool): currently ignored
    Returns:
        (str): The pronounced number
    """
    # TODO short_scale, scientific and ordinals
    # currently ignored

    def pronounce_triplet_hu(num):
        """Word a value from 0 to 999 (0 gives an empty string)."""
        result = ""
        num = floor(num)
        if num > 99:
            hundreds = floor(num / 100)
            hundredConst = _EXTRA_SPACE_HU + 'száz' + _EXTRA_SPACE_HU
            if hundreds == 1:
                result += hundredConst
            elif hundreds == 2:
                # "két" is the attributive form of two
                result += 'két' + hundredConst
            else:
                result += _NUM_STRING_HU[hundreds] + hundredConst
            num -= hundreds * 100
        if num == 0:
            pass
        elif num <= 20:
            result += _NUM_STRING_HU[num]
        else:
            ones = num % 10
            tens = num - ones
            if tens != 20:
                result += _NUM_STRING_HU[tens] + _EXTRA_SPACE_HU
            else:
                # twenty has a combining form in 21..29
                result += "huszon" + _EXTRA_SPACE_HU
            if ones > 0:
                result += _NUM_STRING_HU[ones] + _EXTRA_SPACE_HU
        return result

    def pronounce_whole_number_hu(num, scale_level=0):
        """Word an integer recursively, three digits per scale level.

        The result may carry '-' separators after scale words, including a
        trailing one; callers strip it.
        """
        if num == 0:
            return ''

        num = floor(num)
        result = ''
        last_triplet = num % 1000

        if last_triplet == 1:
            if scale_level == 0:
                result += "egy"
            elif scale_level == 1:
                # one thousand is just the scale word
                result += _EXTRA_SPACE_HU + \
                    _NUM_POWERS_OF_TEN[1] + _EXTRA_SPACE_HU
            else:
                result += "egy" + _NUM_POWERS_OF_TEN[scale_level]
        elif last_triplet > 1:
            result += pronounce_triplet_hu(last_triplet)
            if scale_level != 0:
                # attributive "két" in front of a scale word
                result = result.replace(_NUM_STRING_HU[2], 'két')
            if scale_level == 1:
                result += _NUM_POWERS_OF_TEN[1] + _EXTRA_SPACE_HU
            if scale_level >= 2:
                result += _NUM_POWERS_OF_TEN[scale_level]
            # NOTE(review): nesting of this separator was reconstructed
            # from whitespace-collapsed text -- confirm against upstream.
            if scale_level > 0:
                result += '-'

        num = floor(num / 1000)
        scale_level += 1
        return pronounce_whole_number_hu(num, scale_level) + result

    result = ""
    if abs(number) >= 1000000000000000000000000:  # cannot do more than this
        return str(number)
    elif number == 0:
        return str(_NUM_STRING_HU[0])
    elif number < 0:
        return "mínusz " + pronounce_number_hu(abs(number), places)
    else:
        if number == int(number):
            return pronounce_whole_number_hu(number).strip('-')
        else:
            whole_number_part = floor(number)
            fractional_part = number - whole_number_part
            if whole_number_part == 0:
                result += _NUM_STRING_HU[0]
            # strip the scale separator exactly like the integer branch
            # does; 2000.5 used to come out as "kétezer- egész ..."
            result += pronounce_whole_number_hu(whole_number_part).strip('-')
            if places > 0:
                result += " egész "
                fraction = pronounce_whole_number_hu(
                    round(fractional_part * 10 ** places)).strip('-')
                result += fraction.replace(_NUM_STRING_HU[2], 'két')
                # tenths, hundredths, ... named only up to five places
                fraction_suffixes = [
                    'tized', 'század', 'ezred', 'tízezred', 'százezred']

                if places <= len(fraction_suffixes):
                    result += ' ' + fraction_suffixes[places - 1]
            return result
""" ordinals = ["nulladik", "első", "második", "harmadik", "negyedik", "ötödik", "hatodik", "hetedik", "nyolcadik", "kilencedik", "tizedik"] big_ordinals = ["", "ezredik", "milliomodik"] # only for whole positive numbers including zero if number < 0 or number != int(number): return number elif number < 11: return ordinals[number] else: # concatenate parts and inflect them accordingly root = pronounce_number_hu(number) vtype = _get_vocal_type_hu(root) last_digit = number - floor(number / 10) * 10 if root == "húsz": root = "husz" if number % 1000000 == 0: return root.replace(_NUM_POWERS_OF_TEN[2], big_ordinals[2]) if number % 1000 == 0: return root.replace(_NUM_POWERS_OF_TEN[1], big_ordinals[1]) if last_digit == 1: return root + "edik" elif root[-1] == 'ő': return root[:-1] + 'edik' elif last_digit != 0: return ordinals[last_digit].join( root.rsplit(_NUM_STRING_HU[last_digit], 1)) return root + "edik" if vtype == 1 else root + "adik" def nice_time_hu(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string # Generate a speakable version of the time speak = "" if use_24hour: speak += pronounce_number_hu(dt.hour) speak = speak.replace(_NUM_STRING_HU[2], 'két') speak += " óra" if not dt.minute == 0: # zero minutes are not pronounced speak += " " + pronounce_number_hu(dt.minute) return speak # ampm is ignored when use_24hour is true else: if dt.hour == 0 and dt.minute == 0: return "éjfél" if dt.hour == 12 and dt.minute == 0: return "dél" # TODO: "half past 3", "a quarter of 4" and other idiomatic times if dt.hour == 0: speak += pronounce_number_hu(12) elif dt.hour < 13: speak = pronounce_number_hu(dt.hour) else: speak = pronounce_number_hu(dt.hour - 12) speak = speak.replace(_NUM_STRING_HU[2], 'két') speak += " óra" if not dt.minute == 0: speak += " " + pronounce_number_hu(dt.minute) if use_ampm: if dt.hour > 11: if dt.hour < 18: speak = "délután " + speak # 12:01 - 17:59 elif dt.hour < 22: speak = "este " + speak # 18:00 - 21:59 este/evening else: speak = "éjjel " + speak # 22:00 - 23:59 éjjel/at night elif dt.hour < 3: speak = "éjjel " + speak # 00:01 - 02:59 éjjel/at night else: speak = "reggel " + speak # 03:00 - 11:59 reggel/in t. morning return speak lingua-franca-release-v0.4.3/lingua_franca/lang/format_it.py000066400000000000000000000254101426211343400241030ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_it import _NUM_STRING_IT, \ _FRACTION_STRING_IT, _LONG_SCALE_IT, _SHORT_SCALE_IT def nice_number_it(number, speech=True, denominators=range(1, 21)): """ Italian helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 e un mezz" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) # denominatore den_str = _FRACTION_STRING_IT[den] # frazione if whole == 0: if num == 1: # un decimo return_string = 'un {}'.format(den_str) else: # tre mezzi return_string = '{} {}'.format(num, den_str) # interi >10 elif num == 1: # trenta e un return_string = '{} e un {}'.format(whole, den_str) # interi >10 con frazioni else: # venti e 3 decimi return_string = '{} e {} {}'.format(whole, num, den_str) # gestisce il plurale del denominatore if num > 1: return_string += 'i' else: return_string += 'o' return return_string def pronounce_number_it(number, places=2, short_scale=False, scientific=False): """ Convert a number to it's spoken equivalent adapted to italian fron en version For example, '5.2' would return 'cinque virgola due' Args: num(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) 
https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation Returns: (str): The pronounced number """ num = number # gestione infinito if num == float("inf"): return "infinito" elif num == float("-inf"): return "meno infinito" if scientific: number = '%E' % num n, power = number.replace("+", "").split("E") power = int(power) if power != 0: return '{}{} per dieci elevato alla {}{}'.format( 'meno ' if float(n) < 0 else '', pronounce_number_it(abs(float(n)), places, short_scale, False), 'meno ' if power < 0 else '', pronounce_number_it(abs(power), places, short_scale, False)) if short_scale: number_names = _NUM_STRING_IT.copy() number_names.update(_SHORT_SCALE_IT) else: number_names = _NUM_STRING_IT.copy() number_names.update(_LONG_SCALE_IT) digits = [number_names[n] for n in range(0, 20)] tens = [number_names[n] for n in range(10, 100, 10)] if short_scale: hundreds = [_SHORT_SCALE_IT[n] for n in _SHORT_SCALE_IT.keys()] else: hundreds = [_LONG_SCALE_IT[n] for n in _LONG_SCALE_IT.keys()] # deal with negatives result = "" if num < 0: result = "meno " num = abs(num) # check for a direct match if num in number_names: if num > 90: result += "" # inizio stringa result += number_names[num] else: def _sub_thousand(n): assert 0 <= n <= 999 if n <= 19: return digits[n] elif n <= 99: q, r = divmod(n, 10) _deci = tens[q-1] _unit = r _partial = _deci if _unit > 0: if _unit == 1 or _unit == 8: _partial = _partial[:-1] # ventuno ventotto _partial += number_names[_unit] return _partial else: q, r = divmod(n, 100) if q == 1: _partial = "cento" else: _partial = digits[q] + "cento" _partial += ( " " + _sub_thousand(r) if r else "") # separa centinaia return _partial def _short_scale(n): if n >= max(_SHORT_SCALE_IT.keys()): return "numero davvero enorme" n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000)): if not z: continue number = _sub_thousand(z) if i: number += "" # separa ordini grandezza number += hundreds[i] 
res.append(number) return ", ".join(reversed(res)) def _split_by(n, split=1000): assert 0 <= n res = [] while n: n, r = divmod(n, split) res.append(r) return res def _long_scale(n): if n >= max(_LONG_SCALE_IT.keys()): return "numero davvero enorme" n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000000)): if not z: continue number = pronounce_number_it(z, places, True, scientific) # strip off the comma after the thousand if i: # plus one as we skip 'thousand' # (and 'hundred', but this is excluded by index value) number = number.replace(',', '') number += " " + hundreds[i+1] res.append(number) return ", ".join(reversed(res)) if short_scale: result += _short_scale(num) else: result += _long_scale(num) # normalizza unità misura singole e 'ragionevoli' ed ad inizio stringa if result == 'mila': result = 'mille' if result == 'milioni': result = 'un milione' if result == 'miliardi': result = 'un miliardo' if result[0:7] == 'unomila': result = result.replace('unomila', 'mille', 1) if result[0:10] == 'unomilioni': result = result.replace('unomilioni', 'un milione', 1) # if result[0:11] == 'unomiliardi': # result = result.replace('unomiliardi', 'un miliardo', 1) # Deal with fractional part if not num == int(num) and places > 0: if abs(num) < 1.0 and (result == "meno " or not result): result += "zero" result += " virgola" _num_str = str(num) _num_str = _num_str.split(".")[1][0:places] for char in _num_str: result += " " + number_names[int(char)] return result def nice_time_it(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format adapted to italian fron en version For example, generate 'cinque e trenta' for speech or '5:30' for text display. 
Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. "3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string # Generate a speakable version of the time if use_24hour: speak = "" # Either "zero 8 zerozero" o "13 zerozero" if string[0:2] == '00': speak += "zerozero" elif string[0] == '0': speak += pronounce_number_it(int(string[0])) + " " if int(string[1]) == 1: speak = "una" else: speak += pronounce_number_it(int(string[1])) else: speak = pronounce_number_it(int(string[0:2])) # in italian "13 e 25" speak += " e " if string[3:5] == '00': speak += "zerozero" else: if string[3] == '0': speak += pronounce_number_it(0) + " " speak += pronounce_number_it(int(string[4])) else: speak += pronounce_number_it(int(string[3:5])) return speak else: if dt.hour == 0 and dt.minute == 0: return "mezzanotte" if dt.hour == 12 and dt.minute == 0: return "mezzogiorno" # TODO: "10 e un quarto", "4 e tre quarti" and ot her idiomatic times if dt.hour == 0: speak = "mezzanotte" elif dt.hour == 1 or dt.hour == 13: speak = "una" elif dt.hour > 13: # era minore speak = pronounce_number_it(dt.hour-12) else: speak = pronounce_number_it(dt.hour) speak += " e" if dt.minute == 0: speak = speak[:-2] if not use_ampm: speak += " in punto" elif dt.minute == 15: speak += " un quarto" elif dt.minute == 45: speak += " tre quarti" else: if dt.minute < 10: speak += " zero" speak += " " + pronounce_number_it(dt.minute) if use_ampm: if dt.hour < 4: speak.strip() elif dt.hour > 20: speak += " della notte" elif dt.hour 
> 17: speak += " della sera" elif dt.hour > 12: speak += " del pomeriggio" else: speak += " della mattina" return speak lingua-franca-release-v0.4.3/lingua_franca/lang/format_nl.py000066400000000000000000000262551426211343400241100ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from .format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_nl import _NUM_POWERS_OF_TEN, \ _NUM_STRING_NL, _FRACTION_STRING_NL, _EXTRA_SPACE_NL, _MONTHS_NL from math import floor def nice_number_nl(number, speech=True, denominators=range(1, 21)): """ Dutch helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 einhalb" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)).replace(".", ",") whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. 
"1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_NL[den] if whole == 0: if num == 1: return_string = 'één {}'.format(den_str) else: return_string = '{} {}'.format(num, den_str) elif num == 1: return_string = '{} en één {}'.format(whole, den_str) else: return_string = '{} en {} {}'.format(whole, num, den_str) return return_string def pronounce_number_nl(number, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: number(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ # TODO short_scale, scientific and ordinals # currently ignored def pronounce_triplet_nl(num): result = "" num = floor(num) if num > 99: hundreds = floor(num / 100) if hundreds > 0: result += _NUM_STRING_NL[ hundreds] + _EXTRA_SPACE_NL + 'honderd' + _EXTRA_SPACE_NL num -= hundreds * 100 if num == 0: result += '' # do nothing elif num <= 20: result += _NUM_STRING_NL[num] # + _EXTRA_SPACE_DA elif num > 20: ones = num % 10 tens = num - ones if ones > 0: result += _NUM_STRING_NL[ones] + _EXTRA_SPACE_NL if tens > 0: result += 'en' + _EXTRA_SPACE_NL if tens > 0: result += _NUM_STRING_NL[tens] + _EXTRA_SPACE_NL return result def pronounce_fractional_nl(num, places): # fixed number of places even with # trailing zeros result = "" place = 10 while places > 0: # doesn't work with 1.0001 and places = 2: int( # number*place) % 10 > 0 and places > 0: result += " " + _NUM_STRING_NL[int(num * place) % 10] if int(num * place) % 10 == 1: result += '' # "1" is pronounced "eins" after the decimal # point place 
*= 10 places -= 1 return result def pronounce_whole_number_nl(num, scale_level=0): if num == 0: return '' num = floor(num) result = '' last_triplet = num % 1000 if last_triplet == 1: if scale_level == 0: if result != '': result += '' + 'één' else: result += "één" elif scale_level == 1: result += 'één' + _EXTRA_SPACE_NL + 'duizend' + _EXTRA_SPACE_NL else: result += "één " + _NUM_POWERS_OF_TEN[scale_level] + ' ' elif last_triplet > 1: result += pronounce_triplet_nl(last_triplet) if scale_level == 1: # result += _EXTRA_SPACE_DA result += 'duizend' + _EXTRA_SPACE_NL if scale_level >= 2: # if _EXTRA_SPACE_DA == '': # result += " " result += " " + _NUM_POWERS_OF_TEN[scale_level] + ' ' if scale_level >= 2: if scale_level % 2 == 0: result += "" # Miljioen result += "" # Miljard, Miljoen num = floor(num / 1000) scale_level += 1 return pronounce_whole_number_nl(num, scale_level) + result + '' result = "" if abs(number) >= 1000000000000000000000000: # cannot do more than this return str(number) elif number == 0: return str(_NUM_STRING_NL[0]) elif number < 0: return "min " + pronounce_number_nl(abs(number), places) else: if number == int(number): return pronounce_whole_number_nl(number) else: whole_number_part = floor(number) fractional_part = number - whole_number_part result += pronounce_whole_number_nl(whole_number_part) if places > 0: result += " komma" result += pronounce_fractional_nl(fractional_part, places) return result def pronounce_ordinal_nl(number): """ This function pronounces a number as an ordinal 1 -> first 2 -> second Args: number (int): the number to format Returns: (str): The pronounced number string. 
""" ordinals = ["nulste", "eerste", "tweede", "derde", "vierde", "vijfde", "zesde", "zevende", "achtste"] # only for whole positive numbers including zero if number < 0 or number != int(number): return number if number < 4: return ordinals[number] if number < 8: return pronounce_number_nl(number) + "de" if number < 9: return pronounce_number_nl(number) + "ste" if number < 20: return pronounce_number_nl(number) + "de" return pronounce_number_nl(number) + "ste" def nice_time_nl(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string # Generate a speakable version of the time speak = "" if use_24hour: speak += pronounce_number_nl(dt.hour) speak += " uur" if not dt.minute == 0: # zero minutes are not pronounced, 13:00 is # "13 uur" not "13 hundred hours" speak += " " + pronounce_number_nl(dt.minute) return speak # ampm is ignored when use_24hour is true else: if dt.hour == 0 and dt.minute == 0: return "Middernacht" hour = dt.hour % 12 if dt.minute == 0: hour = _fix_hour_nl(hour) speak += pronounce_number_nl(hour) speak += " uur" elif dt.minute == 30: speak += "half " hour += 1 hour = _fix_hour_nl(hour) speak += pronounce_number_nl(hour) elif dt.minute == 15: speak += "kwart over " hour = _fix_hour_nl(hour) speak += pronounce_number_nl(hour) elif dt.minute == 45: speak += "kwart voor " hour += 1 hour = _fix_hour_nl(hour) speak += pronounce_number_nl(hour) elif dt.minute > 30: speak += pronounce_number_nl(60 - dt.minute) speak += " voor " hour += 1 hour = _fix_hour_nl(hour) speak += pronounce_number_nl(hour) else: speak += pronounce_number_nl(dt.minute) speak += " over " hour = _fix_hour_nl(hour) speak += pronounce_number_nl(hour) if use_ampm: speak += nice_part_of_day_nl(dt) return speak def _fix_hour_nl(hour): hour = hour % 12 if hour == 0: hour = 12 return hour def nice_part_of_day_nl(dt, speech=True): if dt.hour < 6: return " 's nachts" if dt.hour < 12: return " 's ochtends" if dt.hour < 18: return " 's middags" if dt.hour < 24: return " 's avonds" raise ValueError('dt.hour is bigger than 24') def nice_response_nl(text): # check for months and call _nice_ordinal_nl declension of ordinals # replace "^" with "tot de macht" (to the power of) words = text.split() for idx, word in enumerate(words): if word.lower() in _MONTHS_NL: text = _nice_ordinal_nl(text) if word == '^': wordNext = words[idx + 1] if idx + 1 < len(words) else "" if wordNext.isnumeric(): 
words[idx] = "tot de macht" text = " ".join(words) return text def _nice_ordinal_nl(text, speech=True): # check for months for declension of ordinals before months # depending on articles/prepositions normalized_text = text words = text.split() for idx, word in enumerate(words): wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordPrev = words[idx - 1] if idx > 0 else "" if word[:-1].isdecimal(): if wordNext.lower() in _MONTHS_NL: if wordPrev == 'de': word = pronounce_ordinal_nl(int(word)) else: word = pronounce_number_nl(int(word)) words[idx] = word normalized_text = " ".join(words) return normalized_text lingua-franca-release-v0.4.3/lingua_franca/lang/format_pl.py000066400000000000000000000277501426211343400241130ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_pl import _NUM_STRING_PL, \ _FRACTION_STRING_PL, _SHORT_SCALE_PL, _SHORT_ORDINAL_PL, _ALT_ORDINALS_PL from lingua_franca.internal import FunctionNotLocalizedError def nice_number_pl(number, speech=True, denominators=range(1, 21)): """ English helper for nice_number This function formats a float to human understandable functions. 
Like 4.5 becomes "4 and a half" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. "1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_PL[den] if whole == 0: return_string = '{} {}'.format(num, den_str) else: return_string = '{} i {} {}'.format(whole, num, den_str) if num > 1: return_string = return_string[:-1] + 'e' return return_string def pronounce_number_pl(num, places=2, short_scale=True, scientific=False, ordinals=False, scientific_run=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: num(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ # deal with infinity if num == float("inf"): return "nieskończoność" elif num == float("-inf"): return "minus nieskończoność" if scientific: number = '%E' % num n, power = number.replace("+", "").split("E") power = int(power) if power != 0: if ordinals: # This handles negatives of powers separately from the normal # handling since each call disables the scientific flag return '{}{} razy dziesięć do {}{} potęgi'.format( 'minus ' if float(n) < 0 else '', pronounce_number_pl( abs(float(n)), places, short_scale, False, ordinals=False, 
scientific_run=True), 'minus ' if power < 0 else '', pronounce_number_pl(abs(power), places, short_scale, False, ordinals=True, scientific_run=True)) else: # This handles negatives of powers separately from the normal # handling since each call disables the scientific flag return '{}{} razy dziesięć do potęgi {}{}'.format( 'minus ' if float(n) < 0 else '', pronounce_number_pl( abs(float(n)), places, short_scale, False), 'minus ' if power < 0 else '', pronounce_number_pl(abs(power), places, short_scale, False)) number_names = _NUM_STRING_PL.copy() number_names.update(_SHORT_SCALE_PL) digits = [number_names[n] for n in range(0, 20)] if ordinals: tens = [_SHORT_ORDINAL_PL[n] for n in range(10, 100, 10)] else: tens = [number_names[n] for n in range(10, 100, 10)] hundreds = [_SHORT_SCALE_PL[n] for n in _SHORT_SCALE_PL.keys()] # deal with negatives result = "" if num < 0: result = "minus " num = abs(num) # check for a direct match if num in number_names and not ordinals: result += number_names[num] else: def _sub_thousand(n, ordinals=False, iteration=0): assert 0 <= n <= 999 _, n_mod = divmod(n, 10) if iteration > 0 and n in _ALT_ORDINALS_PL and ordinals: return _ALT_ORDINALS_PL[n] elif n in _SHORT_ORDINAL_PL and ordinals: return _SHORT_ORDINAL_PL[n] if not scientific_run \ else _ALT_ORDINALS_PL[n] if n <= 19: return digits[n] if not scientific_run or not ordinals\ else digits[n][:-1] + "ej" elif n <= 99: q, r = divmod(n, 10) tens_text = tens[q - 1] if scientific_run: tens_text = tens_text[:-1] + "ej" return tens_text + (" " + _sub_thousand(r, ordinals) if r else "") else: q, r = divmod(n, 100) digit_name = digits[q] if q*100 in _NUM_STRING_PL: digit_name = _NUM_STRING_PL[q*100] return digit_name + ( " " + _sub_thousand(r, ordinals) if r else "") def _short_scale(n): if n >= max(_SHORT_SCALE_PL.keys()): return "nieskończoność" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000)): if not z: continue 
number = _sub_thousand(z, ordi, iteration=i) if i: if i >= len(hundreds): return "" number += " " if ordi: if i * 1000 in _SHORT_ORDINAL_PL: if z == 1: number = _SHORT_ORDINAL_PL[i * 1000] else: number += _SHORT_ORDINAL_PL[i * 1000] else: if n not in _SHORT_SCALE_PL: num = int("1" + "0" * (len(str(n)) - 2)) number += _SHORT_SCALE_PL[num] + "owa" else: number = _SHORT_SCALE_PL[n] + "ty" else: hundreds_text = _SHORT_SCALE_PL[float(pow(1000, i))] if z != 1: _, z_mod = divmod(z, 10) _, z_mod_tens = divmod(z, 100) n_main, _ = divmod(z_mod_tens, 10) if i == 1: if n_main != 1 and 5 > z_mod > 0: hundreds_text += "e" else: hundreds_text = "tysięcy" elif i > 1: hundreds_text += "y" if 5 > z_mod > 0 else "ów" number += hundreds_text res.append(number) ordi = False return ", ".join(reversed(res)) def _split_by(n, split=1000): assert 0 <= n res = [] while n: n, r = divmod(n, split) res.append(r) return res result += _short_scale(num) # deal with scientific notation unpronounceable as number if not result and "e" in str(num): return pronounce_number_pl(num, places, short_scale, scientific=True) # Deal with fractional part elif not num == int(num) and places > 0: if abs(num) < 1.0 and (result == "minus " or not result): result += "zero" result += " przecinek" _num_str = str(num) _num_str = _num_str.split(".")[1][0:places] for char in _num_str: result += " " + number_names[int(char)] return result def nice_time_pl(dt, speech=True, use_24hour=True, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. 
Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ string = dt.strftime("%H:%M") if not speech: return string # Generate a speakable version of the time speak = "" # Either "0 8 hundred" or "13 hundred" if string[0:2] == '00': speak = "" elif string[0] == '0': speak += pronounce_number_pl(int(string[1]), ordinals=True) speak = speak[:-1] + 'a' else: speak = pronounce_number_pl(int(string[0:2]), ordinals=True) speak = speak[:-1] + 'a' speak += ' ' if string[0:2] != '00' else '' if string[3:5] == '00': speak += 'zero zero' else: if string[3] == '0': speak += pronounce_number_pl(int(string[4])) else: speak += pronounce_number_pl(int(string[3:5])) if string[0:2] == '00': speak += " po północy" return speak def nice_duration_pl(duration, speech=True): """ Convert duration to a nice spoken timespan Args: seconds: number of seconds minutes: number of minutes hours: number of hours days: number of days Returns: str: timespan as a string """ # TODO this is a kludge around the fact that only Polish has a # localized nice_duration() if not speech: raise FunctionNotLocalizedError days = int(duration // 86400) hours = int(duration // 3600 % 24) minutes = int(duration // 60 % 60) seconds = int(duration % 60) out = '' sec_main, sec_div = divmod(seconds, 10) min_main, min_div = divmod(minutes, 10) hour_main, hour_div = divmod(hours, 10) if days > 0: out += pronounce_number_pl(days) + " " if days == 1: out += 'dzień' else: out += 'dni' if hours > 0: if out: out += " " out += get_pronounce_number_for_duration(hours) + " " if hours == 1: out += 'godzina' elif hour_main == 1 or hour_div > 4: out += 'godzin' else: out += 'godziny' if minutes > 0: if out: out += " " out += get_pronounce_number_for_duration(minutes) + " " if minutes == 
1: out += 'minuta' elif min_main == 1 or min_div > 4: out += 'minut' else: out += 'minuty' if seconds > 0: if out: out += " " out += get_pronounce_number_for_duration(seconds) + " " if sec_div == 0: out += 'sekund' elif seconds == 1: out += 'sekunda' elif sec_main == 1 or sec_div > 4: out += 'sekund' else: out += 'sekundy' return out def get_pronounce_number_for_duration(num): pronounced = pronounce_number_pl(num) return 'jedna' if pronounced == 'jeden' else pronounced lingua-franca-release-v0.4.3/lingua_franca/lang/format_pt.py000066400000000000000000000155141426211343400241160ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \ _NUM_STRING_PT def nice_number_pt(number, speech, denominators=range(1, 21)): """ Portuguese helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 e meio" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. 
""" result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. "1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) # denominador den_str = _FRACTION_STRING_PT[den] # fracções if whole == 0: if num == 1: # um décimo return_string = 'um {}'.format(den_str) else: # três meio return_string = '{} {}'.format(num, den_str) # inteiros >10 elif num == 1: # trinta e um return_string = '{} e {}'.format(whole, den_str) # inteiros >10 com fracções else: # vinte e 3 décimo return_string = '{} e {} {}'.format(whole, num, den_str) # plural if num > 1: return_string += 's' return return_string def pronounce_number_pt(number, places=2): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'cinco virgula dois' Args: number(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak Returns: (str): The pronounced number """ if abs(number) >= 100: # TODO: Support n > 100 return str(number) result = "" if number < 0: result = "menos " number = abs(number) if number >= 20: tens = int(number - int(number) % 10) ones = int(number - tens) result += _NUM_STRING_PT[tens] if ones > 0: result += " e " + _NUM_STRING_PT[ones] else: result += _NUM_STRING_PT[int(number)] # Deal with decimal part, in portuguese is commonly used the comma # instead the dot. 
Decimal part can be written both with comma # and dot, but when pronounced, its pronounced "virgula" if not number == int(number) and places > 0: if abs(number) < 1.0 and (result == "menos " or not result): result += "zero" result += " vírgula" _num_str = str(number) _num_str = _num_str.split(".")[1][0:places] for char in _num_str: result += " " + _NUM_STRING_PT[int(char)] return result def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'cinco treinta' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string # Generate a speakable version of the time speak = "" if use_24hour: # simply speak the number if dt.hour == 1: speak += "uma" else: speak += pronounce_number_pt(dt.hour) # equivalent to "quarter past ten" if dt.minute > 0: speak += " e " + pronounce_number_pt(dt.minute) else: # speak number and add daytime identifier # (equivalent to "in the morning") if dt.minute == 35: minute = -25 hour = dt.hour + 1 elif dt.minute == 40: minute = -20 hour = dt.hour + 1 elif dt.minute == 45: minute = -15 hour = dt.hour + 1 elif dt.minute == 50: minute = -10 hour = dt.hour + 1 elif dt.minute == 55: minute = -5 hour = dt.hour + 1 else: minute = dt.minute hour = dt.hour if hour == 0: speak += "meia noite" elif hour == 12: speak += "meio dia" # 1 and 2 are pronounced in female form when talking about hours elif hour == 1 or hour == 13: speak += "uma" elif hour == 2 or hour == 14: speak += "duas" elif hour < 13: speak = pronounce_number_pt(hour) else: speak = pronounce_number_pt(hour - 12) if minute != 0: if minute == 15: speak += " e um quarto" elif minute == 30: speak += " e meia" elif minute == -15: speak += " menos um quarto" else: if minute > 0: speak += " e " + pronounce_number_pt(minute) else: speak += " " + pronounce_number_pt(minute) # exact time if minute == 0 and not use_ampm: # 3:00 speak += " em ponto" if use_ampm: if hour > 0 and hour < 6: speak += " da madrugada" elif hour >= 6 and hour < 12: speak += " da manhã" elif hour >= 13 and hour < 21: speak += " da tarde" elif hour != 0 and hour != 12: speak += " da noite" return speak lingua-franca-release-v0.4.3/lingua_franca/lang/format_ru.py000066400000000000000000000413121426211343400241140ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright 2017 Mycroft AI Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from lingua_franca.lang.format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_ru import _NUM_STRING_RU, \ _FRACTION_STRING_RU, _LONG_SCALE_RU, _SHORT_SCALE_RU, _SHORT_ORDINAL_RU, _LONG_ORDINAL_RU from lingua_franca.internal import FunctionNotLocalizedError def nice_number_ru(number, speech=True, denominators=range(1, 21)): """ English helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 and a half" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. 
"1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_RU[den] if whole == 0: if num == 1 and den <= 4: return_string = '{}'.format(den_str) else: return_string = '{} {}'.format(num, den_str) elif num == 1 and den == 2: return_string = '{} с половиной'.format(whole) else: return_string = '{} и {} {}'.format(whole, num, den_str) if 2 <= den <= 4: if 2 <= num <= 4: return_string = return_string[:-1] + 'и' elif num > 4: return_string = return_string[:-1] + 'ей' elif den >= 5: if 2 <= num <= 4: return_string = return_string[:-2] + 'ые' elif num > 4: return_string = return_string[:-2] + 'ых' return return_string def pronounce_number_ru(number, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: number(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ num = number # deal with infinity if num == float("inf"): return "бесконечность" elif num == float("-inf"): return "минус бесконечность" if scientific: number = '%E' % num n, power = number.replace("+", "").split("E") power = int(power) if power != 0: if ordinals: # This handles negative powers separately from the normal # handling since each call disables the scientific flag return '{}{} на десять в {}{} степени'.format( 'минус ' if float(n) < 0 else '', pronounce_number_ru( abs(float(n)), places, short_scale, False, ordinals=True), 'минус ' if power < 0 else '', pronounce_number_ru(abs(power), places, short_scale, False, ordinals=True)) else: # This handles negative powers separately from the normal # handling since 
each call disables the scientific flag return '{}{} на десять в степени {}{}'.format( 'минус ' if float(n) < 0 else '', pronounce_number_ru( abs(float(n)), places, short_scale, False, ordinals=False), 'минус ' if power < 0 else '', pronounce_number_ru(abs(power), places, short_scale, False, ordinals=False)) if short_scale: number_names = _NUM_STRING_RU.copy() number_names.update(_SHORT_SCALE_RU) else: number_names = _NUM_STRING_RU.copy() number_names.update(_LONG_SCALE_RU) digits = [number_names[n] for n in range(0, 20)] tens = [number_names[n] for n in range(10, 100, 10)] if short_scale: hundreds = [_SHORT_SCALE_RU[n] for n in _SHORT_SCALE_RU.keys()] else: hundreds = [_LONG_SCALE_RU[n] for n in _LONG_SCALE_RU.keys()] # deal with negative numbers result = "" if num < 0: result = "минус " num = abs(num) # check for a direct match if num in number_names and not ordinals: result += number_names[num] else: def _sub_thousand(n, ordinals=False): assert 0 <= n <= 999 if n in _SHORT_ORDINAL_RU and ordinals: return _SHORT_ORDINAL_RU[n] if n <= 19: return digits[n] elif n <= 99: q, r = divmod(n, 10) return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r else "") else: q, r = divmod(n, 100) return _NUM_STRING_RU[q * 100] + (" " + _sub_thousand(r, ordinals) if r else "") def _short_scale(n): if n > max(_SHORT_SCALE_RU.keys()): return "бесконечность" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000)): if not z: continue number = _sub_thousand(z, not i and ordi) if i: if i >= len(hundreds): return "" if ordi: if i * 1000 in _SHORT_ORDINAL_RU: if z == 1: number = _SHORT_ORDINAL_RU[i * 1000] else: if z > 5: number = number[:-1] + "и" number += _SHORT_ORDINAL_RU[i * 1000] else: if n not in _SHORT_SCALE_RU: num = int("1" + "0" * (len(str(n)) // 3 * 3)) if number[-3:] == "два": number = number[:-1] + "ух" elif number[-2:] == "ри" or number[-2:] == "ре": number = number[:-1] + "ёх" elif number[-1:] == "ь": 
number = number[:-1] + "и" number += _SHORT_SCALE_RU[num] + "ный" else: number = _SHORT_SCALE_RU[n] + "ный" elif z == 1: number = hundreds[i - 1] else: if i == 1: if z % 10 == 1 and z % 100 // 10 != 1: number = number[:-2] + "на" elif z % 10 == 2 and z % 100 // 10 != 1: number = number[:-1] + "е" number += " " + plural_ru(z, "тысяча", "тысячи", "тысяч") elif 1 <= z % 10 <= 4 and z % 100 // 10 != 1: number += " " + hundreds[i - 1] + "а" else: number += " " + hundreds[i - 1] + "ов" res.append(number) ordi = False return " ".join(reversed(res)) def _split_by(n, split=1000): assert 0 <= n res = [] while n: n, r = divmod(n, split) res.append(r) return res def _long_scale(n): if n >= max(_LONG_SCALE_RU.keys()): return "бесконечность" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] for i, z in enumerate(_split_by(n, 1000000)): if not z: continue number = pronounce_number_ru(z, places, True, scientific, ordinals=ordi and not i) # strip off the comma after the thousand if i: if i >= len(hundreds): return "" # plus one as we skip 'thousand' # (and 'hundred', but this is excluded by index value) number = number.replace(',', '') if ordi: if (i + 1) * 1000000 in _LONG_ORDINAL_RU: if z == 1: number = _LONG_ORDINAL_RU[ (i + 1) * 1000000] else: number += _LONG_ORDINAL_RU[ (i + 1) * 1000000] else: if n not in _LONG_SCALE_RU: num = int("1" + "0" * (len(str(n)) // 3 * 3)) if number[-3:] == "два": number = number[:-1] + "ух" elif number[-2:] == "ри" or number[-2:] == "ре": number = number[:-1] + "ёх" elif number[-1:] == "ь": number = number[:-1] + "и" number += _LONG_SCALE_RU[num] + "ный" else: number = " " + _LONG_SCALE_RU[n] + "ный" elif z == 1: number = hundreds[i] elif z <= 4: number += " " + hundreds[i] + "а" else: number += " " + hundreds[i] + "ов" res.append(number) return " ".join(reversed(res)) if short_scale: result += _short_scale(num) else: result += _long_scale(num) # deal with scientific notation unpronounceable as number if not result and 
"e" in str(num): return pronounce_number_ru(num, places, short_scale, scientific=True) # Deal with fractional part elif not num == int(num) and places > 0: if abs(num) < 1.0 and (result == "минус " or not result): result += "ноль" result += " точка" _num_str = str(num) _num_str = _num_str.split(".")[1][0:places] for char in _num_str: result += " " + number_names[int(char)] return result def nice_time_ru(dt, speech=True, use_24hour=True, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'five thirty' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. "03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M") if dt.hour < 4: string += " ночи" elif dt.hour < 12: string += " утра" elif dt.hour < 18: string += " дня" else: string += " вечера" else: # e.g. 
"3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string # Generate a speakable version of the time if use_24hour: speak = "" # Either "0 8 hundred" or "13 hundred" if string[0] == '0': speak += pronounce_hour_ru(int(string[0])) + " " speak += pronounce_number_ru(int(string[1])) else: speak = pronounce_hour_ru(int(string[0:2])) speak += " " if string[3:5] == '00': speak += "ровно" else: if string[3] == '0': speak += pronounce_number_ru(0) + " " speak += pronounce_number_ru(int(string[4])) else: speak += pronounce_number_ru(int(string[3:5])) return speak else: if dt.hour == 0 and dt.minute == 0: return "полночь" elif dt.hour == 12 and dt.minute == 0: return "полдень" hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 if dt.minute == 15: speak = pronounce_hour_ru(hour) + " с четвертью" elif dt.minute == 30: speak = pronounce_hour_ru(hour) + " с половиной" elif dt.minute == 45: next_hour = (dt.hour + 1) % 12 or 12 speak = "без четверти " + pronounce_hour_ru(next_hour) else: speak = pronounce_hour_ru(hour) if dt.minute == 0: if not use_ampm: if dt.hour % 12 == 1: return speak return speak + " " + plural_ru(dt.hour % 12, "час", "часа", "часов") else: if dt.minute < 10: speak += " ноль" speak += " " + pronounce_number_ru(dt.minute) if use_ampm: if dt.hour < 4: speak += " ночи" elif dt.hour < 12: speak += " утра" elif dt.hour < 18: speak += " дня" else: speak += " вечера" return speak def nice_duration_ru(duration, speech=True): """ Convert duration to a nice spoken timespan Args: seconds: number of seconds minutes: number of minutes hours: number of hours days: number of days Returns: str: timespan as a string """ if not speech: raise FunctionNotLocalizedError days = int(duration // 86400) hours = int(duration // 3600 % 24) minutes = int(duration // 60 % 60) seconds = int(duration % 60) out = '' if days > 0: out += pronounce_number_ru(days) out += " " + plural_ru(days, "день", 
"дня", "дней") if hours > 0: if out: out += " " out += pronounce_number_ru(hours) out += " " + plural_ru(hours, "час", "часа", "часов") if minutes > 0: if out: out += " " out += pronounce_number_feminine_ru(minutes) out += " " + plural_ru(minutes, "минута", "минуты", "минут") if seconds > 0: if out: out += " " out += pronounce_number_feminine_ru(seconds) out += " " + plural_ru(seconds, "секунда", "секунды", "секунд") return out def pronounce_hour_ru(num): if num == 1: return "час" return pronounce_number_ru(num) def pronounce_number_feminine_ru(num): pronounced = pronounce_number_ru(num) num %= 100 if num % 10 == 1 and num // 10 != 1: return pronounced[:-2] + "на" elif num % 10 == 2 and num // 10 != 1: return pronounced[:-1] + "е" return pronounced def plural_ru(num: int, one: str, few: str, many: str): num %= 100 if num // 10 == 1: return many if num % 10 == 1: return one if 2 <= num % 10 <= 4: return few return many lingua-franca-release-v0.4.3/lingua_franca/lang/format_sl.py000066400000000000000000000336541426211343400241160ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# from lingua_franca.lang.common_data_sl import _NUM_STRING_SL, \ _FRACTION_STRING_SL, _LONG_SCALE_SL, _SHORT_SCALE_SL, _SHORT_ORDINAL_SL from lingua_franca.lang.format_common import convert_to_mixed_fraction def nice_number_sl(number, speech=True, denominators=range(1, 21)): """ Slovenian helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "2 in polovica" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_SL[den] if whole == 0: return_string = '{} {}'.format(num, den_str) else: return_string = '{} in {} {}'.format(whole, num, den_str) if num % 100 == 1: pass elif num % 100 == 2: return_string = return_string[:-1] + 'i' elif num % 100 == 3 or num % 100 == 4: return_string = return_string[:-1] + 'e' else: return_string = return_string[:-1] return return_string def pronounce_number_sl(num, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'pet celih dve' Args: num(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ # deal with infinity if num == float("inf"): return "neskončno" elif num == 
float("-inf"): return "minus neskončno" if scientific: number = '%E' % num n, power = number.replace("+", "").split("E") power = int(power) if power != 0: if ordinals: # This handles negatives of powers separately from the normal # handling since each call disables the scientific flag return '{}{} krat deset na {}{}'.format( 'minus ' if float(n) < 0 else '', pronounce_number_sl( abs(float(n)), places, short_scale, False, ordinals=False), 'minus ' if power < 0 else '', pronounce_number_sl(abs(power), places, short_scale, False, ordinals=True)) else: # This handles negatives of powers separately from the normal # handling since each call disables the scientific flag return '{}{} krat deset na {}{}'.format( 'minus ' if float(n) < 0 else '', pronounce_number_sl( abs(float(n)), places, short_scale, False), 'minus ' if power < 0 else '', pronounce_number_sl(abs(power), places, short_scale, False)) if short_scale: number_names = _NUM_STRING_SL.copy() number_names.update(_SHORT_SCALE_SL) else: number_names = _NUM_STRING_SL.copy() number_names.update(_LONG_SCALE_SL) digits = [number_names[n] for n in range(0, 20)] tens = [number_names[n] for n in range(10, 100, 10)] if short_scale: hundreds = [_SHORT_SCALE_SL[n] for n in _SHORT_SCALE_SL.keys()] else: hundreds = [_LONG_SCALE_SL[n] for n in _LONG_SCALE_SL.keys()] # deal with negatives result = "" if num < 0: result = "minus " num = abs(num) # check for a direct match if num in number_names and not ordinals: result += number_names[num] else: def _sub_thousand(n, ordinals=False, is_male=False): assert 0 <= n <= 999 if n in _SHORT_ORDINAL_SL and ordinals: return _SHORT_ORDINAL_SL[n] if n <= 19: if is_male and n == 2: return digits[n][:-1] + "a" return digits[n] elif n <= 99: q, r = divmod(n, 10) sub = _sub_thousand(r, False) if r == 2: sub = sub[:-1] + "a" return ((sub + "in") if r else "") + ( tens[q - 1]) + ("i" if ordinals else "") else: q, r = divmod(n, 100) if q == 1: qstr = "" else: qstr = digits[q] return (qstr + "sto" + 
( " " + _sub_thousand(r, ordinals) if r else "")) def _plural_hundreds(n, hundred, ordi=True): if hundred[-3:] != "jon": if ordi: return hundred + "i" return hundred if n < 1000 or short_scale: if ordi: return hundred + "ti" if n % 100 == 1: return hundred elif n % 100 == 2: return hundred + "a" elif n % 100 == 3 or n % 100 == 4: return hundred + "i" else: return hundred + "ov" else: n //= 1000 if ordi: return hundred[:-3] + "jardti" if n % 100 == 1: return hundred[:-3] + "jarda" elif n % 100 == 2: return hundred[:-3] + "jardi" elif n % 100 == 3 or n % 100 == 4: return hundred[:-3] + "jarde" else: return hundred[:-3] + "jard" def _short_scale(n): if n >= max(_SHORT_SCALE_SL.keys()): return "neskončno" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] split = _split_by(n, 1000) if ordinals and len([a for a in split if a > 0]) == 1: ordi_force = True else: ordi_force = False for i, z in enumerate(split): if not z: continue if z == 1 and i == 1: number = "" elif z > 100 and z % 100 == 2: number = _sub_thousand(z, not i and ordi, is_male=True) elif z > 100 and z % 100 == 3: number = _sub_thousand(z, not i and ordi) + "je" elif z > 1 or i == 0 or ordi: number = _sub_thousand(z, not i and ordi) else: number = "" if i: if i >= len(hundreds): return "" if z > 1: number += " " number += _plural_hundreds( z, hundreds[i], True if ordi_force else not i and ordi) res.append(number) ordi = False return " ".join(reversed(res)) def _split_by(n, split=1000): assert 0 <= n res = [] while n: n, r = divmod(n, split) res.append(r) return res def _long_scale(n): if n >= max(_LONG_SCALE_SL.keys()): return "neskončno" ordi = ordinals if int(n) != n: ordi = False n = int(n) assert 0 <= n res = [] split = _split_by(n, 1000000) if ordinals and len([a for a in split if a > 0]) == 1: ordi_force = True else: ordi_force = False for i, z in enumerate(split): if not z: continue number = pronounce_number_sl(z, places, True, scientific) if z > 100: add = 
number.split()[0] + " " else: add = "" if z % 100 == 2 and i >= 1: number = add + digits[2][:-1] + "a" if z % 100 == 3 and i >= 1: number = add + digits[3] + "je" # strip off the comma after the thousand if i: if i >= len(hundreds): return "" # plus one as we skip 'thousand' # (and 'hundred', but this is excluded by index value) hundred = _plural_hundreds( z, hundreds[i + 1], True if ordi_force else ordi and not i) if z >= 1000: z //= 1000 number = pronounce_number_sl(z, places, True, scientific, ordinals=True if ordi_force else ordi and not i) if z == 1: number = hundred else: number += " " + hundred res.append(number) return " ".join(reversed(res)) if short_scale: result += _short_scale(num) else: result += _long_scale(num) if ordinals: result = result.replace(" ", "") # deal with scientific notation unpronounceable as number if (not result or result == "neskončno") and "e" in str(num): return pronounce_number_sl(num, places, short_scale, scientific=True) # Deal with fractional part elif not num == int(num) and places > 0: if abs(num) < 1.0 and (result == "minus " or not result): result += "nič" if int(abs(num)) % 100 == 1: result += " cela" elif int(abs(num)) % 100 == 2: result += " celi" elif int(abs(num)) % 100 == 3 or int(abs(num)) % 100 == 4: result += " cele" else: result += " celih" _num_str = str(num) _num_str = _num_str.split(".")[1][0:places] for char in _num_str: result += " " + number_names[int(char)] return result def nice_time_sl(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format For example, generate 'pet trideset' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) speech (bool): format for speech (default/True) or display (False)=Fal use_24hour (bool): output in 24-hour/military or 12-hour format use_ampm (bool): include the am/pm for 12-hour format Returns: (str): The formatted time string """ if use_24hour: # e.g. 
"03:01" or "14:22" string = dt.strftime("%H:%M") else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" string = dt.strftime("%I:%M %p") else: # e.g. "3:01" or "2:22" string = dt.strftime("%I:%M") if string[0] == '0': string = string[1:] # strip leading zeros if not speech: return string def _hour_declension(hour): speak = pronounce_number_sl(hour) if hour == 1: return speak[:-1] + "ih" elif hour == 2 or hour == 4: return speak + "h" elif hour == 3: return speak[:-1] + "eh" elif hour == 7 or hour == 8: return speak[:-2] + "mih" else: return speak + "ih" # Generate a speakable version of the time if use_24hour: # "13 nič nič" speak = pronounce_number_sl(int(string[0:2])) speak += " " if string[3:5] == '00': speak += "nič nič" else: if string[3] == '0': speak += pronounce_number_sl(0) + " " speak += pronounce_number_sl(int(string[4])) else: speak += pronounce_number_sl(int(string[3:5])) return speak else: if dt.hour == 0 and dt.minute == 0: return "polnoč" elif dt.hour == 12 and dt.minute == 0: return "poldne" hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 if dt.minute == 0: speak = pronounce_number_sl(hour) elif dt.minute < 30: speak = pronounce_number_sl( dt.minute) + " čez " + pronounce_number_sl(hour) elif dt.minute == 30: next_hour = (dt.hour + 1) % 12 or 12 speak = "pol " + _hour_declension(next_hour) elif dt.minute > 30: next_hour = (dt.hour + 1) % 12 or 12 speak = pronounce_number_sl( 60 - dt.minute) + " do " + _hour_declension(next_hour) if use_ampm: if dt.hour > 11: speak += " p.m." else: speak += " a.m." return speak lingua-franca-release-v0.4.3/lingua_franca/lang/format_sv.py000066400000000000000000000275141426211343400241260ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from .format_common import convert_to_mixed_fraction from lingua_franca.lang.common_data_sv import _EXTRA_SPACE_SV, \ _FRACTION_STRING_SV, _MONTHS_SV, _NUM_POWERS_OF_TEN_SV, _NUM_STRING_SV from math import floor def nice_number_sv(number, speech=True, denominators=range(1, 21)): """ Swedish helper for nice_number This function formats a float to human understandable functions. Like 4.5 becomes "4 och en halv" for speech and "4 1/2" for text Args: number (int or float): the float to format speech (bool): format for speech (True) or display (False) denominators (iter of ints): denominators to use, default [1 .. 20] Returns: (str): The formatted string. """ result = convert_to_mixed_fraction(number, denominators) if not result: # Give up, just represent as a 3 decimal number return str(round(number, 3)) whole, num, den = result if not speech: if num == 0: # TODO: Number grouping? E.g. 
"1,000,000" return str(whole) else: return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) den_str = _FRACTION_STRING_SV[den] if whole == 0: if num == 1: return_string = 'en {}'.format(den_str) else: return_string = '{} {}'.format(num, den_str) elif num == 1: return_string = '{} och en {}'.format(whole, den_str) else: return_string = '{} och {} {}'.format(whole, num, den_str) if num > 1: return_string += 'ar' return return_string def pronounce_number_sv(number, places=2, short_scale=True, scientific=False, ordinals=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'five point two' Args: num(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak short_scale (bool) : use short (True) or long scale (False) https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: (str): The pronounced number """ # TODO short_scale, scientific and ordinals # currently ignored def pronounce_triplet_sv(num): result = "" num = floor(num) if num > 99: hundreds = floor(num / 100) if hundreds > 0: if hundreds == 1: result += 'ett' + 'hundra' else: result += _NUM_STRING_SV[hundreds] + 'hundra' num -= hundreds * 100 if num == 0: result += '' # do nothing elif num == 1: result += 'ett' elif num <= 20: result += _NUM_STRING_SV[num] elif num > 20: tens = num % 10 ones = num - tens if ones > 0: result += _NUM_STRING_SV[ones] if tens > 0: result += _NUM_STRING_SV[tens] return result def pronounce_fractional_sv(num, places): # fixed number of places even with trailing zeros result = "" place = 10 while places > 0: # doesn't work with 1.0001 and places = 2: int( # num*place) % 10 > 0 and places > 0: result += " " + _NUM_STRING_SV[int(num * place) % 10] place *= 10 places -= 1 return result def pronounce_whole_number_sv(num, scale_level=0): if num == 0: return '' num = floor(num) 
def pronounce_ordinal_sv(number):
    """
    Pronounce a number as a Swedish ordinal (base form).

    1 -> "första"
    2 -> "andra"
    10 -> "tionde"

    Args:
        number (int): the number to format
    Returns:
        (str): The pronounced ordinal string. Negative or non-integral
               input is returned unchanged.
    """
    # Only the base form is produced; it still has to be adapted for
    # genus, casus and numerus by the caller.
    ordinals = ["noll", "första", "andra", "tredje", "fjärde", "femte",
                "sjätte", "sjunde", "åttonde", "nionde", "tionde"]

    if number < 0 or number != int(number):
        return number

    # Integral floats (e.g. 3.0) pass the guard above; make them usable as
    # list indices.
    number = int(number)

    # 0..10 are read straight from the table. Without this shortcut 10 fell
    # through to the "no ones digit" branch and came out as just "de".
    if number <= 10:
        return ordinals[number]

    tens, ones = divmod(number, 10)
    result = pronounce_number_sv(tens * 10).rstrip()
    if ones > 0:
        result += ordinals[ones]
    else:
        result += 'de'
    return result
"3:01" or "2:22" string = dt.strftime("%I:%M") if not speech: return string # Generate a speakable version of the time speak = "" if use_24hour: if dt.hour == 1: speak += "ett" # 01:00 is "ett" not "en" else: speak += pronounce_number_sv(dt.hour) if not dt.minute == 0: if dt.minute < 10: speak += ' noll' if dt.minute == 1: speak += ' ett' else: speak += " " + pronounce_number_sv(dt.minute) return speak # ampm is ignored when use_24hour is true else: hour = dt.hour if not dt.minute == 0: if dt.minute < 30: if dt.minute != 15: speak += pronounce_number_sv(dt.minute) else: speak += 'kvart' if dt.minute == 1: speak += ' minut över ' elif dt.minute != 10 and dt.minute != 5 and dt.minute != 15: speak += ' minuter över ' else: speak += ' över ' elif dt.minute > 30: if dt.minute != 45: speak += pronounce_number_sv((60 - dt.minute)) else: speak += 'kvart' if dt.minute == 1: speak += ' minut i ' elif dt.minute != 50 and dt.minute != 55 and dt.minute != 45: speak += ' minuter i ' else: speak += ' i ' hour = (hour + 1) % 12 elif dt.minute == 30: speak += 'halv ' hour = (hour + 1) % 12 if hour == 0 and dt.minute == 0: return "midnatt" if hour == 12 and dt.minute == 0: return "middag" # TODO: "half past 3", "a quarter of 4" and other idiomatic times if hour == 0: speak += pronounce_number_sv(12) elif hour <= 13: if hour == 1 or hour == 13: # 01:00 and 13:00 is "ett" speak += 'ett' else: speak += pronounce_number_sv(hour) else: speak += pronounce_number_sv(hour - 12) if use_ampm: if dt.hour > 11: if dt.hour < 18: # 12:01 - 17:59 nachmittags/afternoon speak += " på eftermiddagen" elif dt.hour < 22: # 18:00 - 21:59 abends/evening speak += " på kvällen" else: # 22:00 - 23:59 nachts/at night speak += " på natten" elif dt.hour < 3: # 00:01 - 02:59 nachts/at night speak += " på natten" else: # 03:00 - 11:59 morgens/in the morning speak += " på morgonen" return speak def nice_response_sv(text): # check for months and call _nice_ordinal_sv declension of ordinals # replace "^" with "hoch" 
def _nice_ordinal_sv(text, speech=True):
    """
    Spell out numeric ordinals ("31.") that directly precede a month name.

    The ending appended to the spelled-out ordinal depends on the word in
    front of it (article/preposition).

    Args:
        text (str): the sentence to process
        speech (bool): accepted but not used by this function
    Returns:
        (str): the sentence with matching ordinals replaced
    """
    tokens = text.split()
    rendered = text
    final_pos = len(tokens) - 1
    n_suffix_triggers = ["om", "den", "från", "till", "(från", "(om", "till"]

    for pos, token in enumerate(tokens):
        # Only tokens written with a trailing dot can be ordinals.
        if not token.endswith("."):
            continue

        following = tokens[pos + 1] if pos < final_pos else ""
        preceding = tokens[pos - 1] if pos > 0 else ""
        digits = token[:-1]

        if digits.isdecimal() and following.lower() in _MONTHS_SV:
            spoken = pronounce_ordinal_sv(int(digits))
            if preceding.lower() in n_suffix_triggers:
                spoken += "n"
            else:
                # "den" is already covered by the list above, so every
                # other preceding word ends up here.
                spoken += "r"
            tokens[pos] = spoken

        rendered = " ".join(tokens)

    return rendered
def is_fractional_ca(input_str, short_scale=True):
    """
    Check whether a Catalan word names a fraction.

    Feminine, plural and acute-accent variants are reduced to the masculine
    singular base form before the lookup ("cinquena", "cinquens", "cinqué"
    -> "cinquè"; "terços" -> "terç"; "mitja", "meitat" -> "mig").

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted for API compatibility; not used here
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    # Normalise case once so both the suffix rules and the lookup are
    # case-insensitive (previously "Mig" raised ValueError).
    word = input_str.lower()

    # The order of the rules matters: the whole-word special cases must be
    # tried before the generic "a" / "es" / "s" stripping.
    if word.endswith('é'):
        word = word[:-1] + "è"  # e.g. "cinqué -> cinquè"
    elif word.endswith('ena'):
        word = word[:-3] + "è"  # e.g. "cinquena -> cinquè"
    elif word.endswith('ens'):
        word = word[:-3] + "è"  # e.g. "cinquens -> cinquè"
    elif word.endswith('enes'):
        word = word[:-4] + "è"  # e.g. "cinquenes -> cinquè"
    elif word.endswith('os'):
        word = word[:-2]  # e.g. "terços -> terç"
    elif word in ('terceres', 'tercera'):
        word = "terç"  # e.g. "tercera -> terç"
    elif word in ('mitges', 'mitja', 'meitat', 'meitats'):
        word = "mig"  # e.g. "mitges -> mig"
    elif word.endswith('a'):
        word = word[:-1]  # e.g. "quarta -> quart"
    elif word.endswith('es'):
        word = word[:-2]  # e.g. "quartes -> quart"
    elif word.endswith('s'):
        word = word[:-1]  # e.g. "quarts -> quart"

    denominators = {
        "mig": 2, "terç": 3, "quart": 4, "cinquè": 5, "sisè": 6,
        # "setè" is the correct spelling; the misspelt "sètè" is kept so
        # input that matched before still matches.
        "setè": 7, "sètè": 7,
        "vuitè": 8, "huitè": 8,
        "novè": 9, "desè": 10, "onzè": 11, "dotzè": 12, "tretzè": 13,
        "catorzè": 14, "quinzè": 15, "setzè": 16, "dissetè": 17,
        "divuitè": 18, "dihuitè": 18,
        "dinovè": 19, "vintè": 20, "trentè": 30, "centè": 100,
        "milè": 1000,
    }

    denominator = denominators.get(word)
    if denominator is None:
        return False
    return 1.0 / denominator
if word in _NUMBERS_CA: val = _NUMBERS_CA[word] elif '-' in word: wordparts = word.split('-') # trenta-cinc > 35 if len(wordparts) == 2 and (wordparts[0] in _TENS_CA and wordparts[1] in _AFTER_TENS_CA): val = _TENS_CA[wordparts[0]] + _AFTER_TENS_CA[wordparts[1]] # vint-i-dues > 22 elif len(wordparts) == 3 and wordparts[1] == 'i' and (wordparts[0] in _TENS_CA and wordparts[2] in _AFTER_TENS_CA): val = _TENS_CA[wordparts[0]]+_AFTER_TENS_CA[wordparts[2]] # quatre-centes > 400 elif len(wordparts) == 2 and (wordparts[0] in _BEFORE_HUNDREDS_CA and wordparts[1] in _HUNDREDS_CA): val = _BEFORE_HUNDREDS_CA[wordparts[0]]*100 elif word.isdigit(): # doesn't work with decimals val = int(word) elif is_numeric(word): val = float(word) elif is_fractional_ca(word): if not result: result = 1 result = result * is_fractional_ca(word) count += 1 continue if not val: # look for fractions like "2/3" aPieces = word.split('/') # if (len(aPieces) == 2 and is_numeric(aPieces[0]) # and is_numeric(aPieces[1])): if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) if val: if result is None: result = 0 # handle fractions #TODO: caution, review use of "ens" word if next_word != "ens": result += val else: result = float(result) / float(val) if next_word is None: break # number word and fraction ands = ["i"] if next_word in ands: zeros = 0 if result is None: count += 1 continue newWords = aWords[count + 2:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_ca(newText[:-1]) if afterAndVal: if result < afterAndVal or result < 20: while afterAndVal > 1: afterAndVal = afterAndVal / 10.0 for word in newWords: if word == "zero" or word == "0": zeros += 1 else: break for _ in range(0, zeros): afterAndVal = afterAndVal / 10.0 result += afterAndVal break elif next_next_word is not None: if next_next_word in ands: newWords = aWords[count + 3:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_ca(newText[:-1]) if 
class CatalanNormalizer(Normalizer):
    """
    Normalizer for Catalan (ca-ES).

    The default configuration is read from the bundled
    "text/ca-es/normalize.json" resource when the class is defined.
    """
    # The resource contains non-ASCII Catalan text, so do not rely on the
    # platform default encoding.
    with open(resolve_resource_file("text/ca-es/normalize.json"),
              encoding="utf-8") as f:
        _default_config = json.load(f)

    @staticmethod
    def tokenize(utterance):
        """
        Split an utterance into tokens.

        Digits are separated from a following "%" and from a preceding "#".
        Hyphenated words are left in one piece; a lone "-" as the last
        token is dropped.

        Args:
            utterance (str): the text to tokenize
        Returns:
            (list): the tokens; empty for an empty or blank utterance
        """
        # Split things like 12%
        utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
        # Split things like #1
        utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance)
        # Hyphenated words (like amo-te) are deliberately not split.
        tokens = utterance.split()
        # Guard against an empty token list before looking at the last one.
        if tokens and tokens[-1] == '-':
            tokens = tokens[:-1]
        return tokens
handle synonims, plurals and equivalents, "demà ben d'hora" = "demà de matí" synonims = {"abans": ["abans-d"], "vinent": ["que vé", "que ve", "que bé", "que be"], "migdia": ["mig dia"], "mitjanit": ["mitja nit"], "matinada": ["matinades", "ben hora ben hora"], "matí": ["matins", "dematí", "dematins", "ben hora"], "tarda": ["tardes", "vesprada", "vesprades", "vespraes"], "nit": ["nits", "vespre", "vespres", "horabaixa", "capvespre"], "demà": ["endemà"], "diàriament": ["diària", "diàries", "cada dia", "tots dies"], "setmanalment": ["setmanal", "setmanals", "cada setmana", "totes setmanes"], "quinzenalment": ["quinzenal", "quinzenals", "cada quinzena", "totes quinzenes"], "mensualment": ["mensual", "mensuals", "cada mes", "tots mesos"], "anualment": ["anual", "anuals", "cada any", "tots anys"], "demàpassat": ["demà-passat", "demà passat", "passat demà", "despús-demà", "despús demà"], "demàpassatpassat": ["demàpassat passat", "passat demàpassat", "demàpassat no altre", "demàpassat altre"], "abansahir": ["abans ahir", "despús ahir", "despús-ahir"], "abansabansahir": ["abans abansahir", "abansahir no altre", "abansahir altre", "abansahir no altre", "abansahir altre"], "segon": ["segons"], "minut": ["minuts"], "quart": ["quarts"], "hora": ["hores"], "dia": ["dies"], "setmana": ["setmanes"], "quinzena": ["quinzenes"], "mes": ["mesos"], "any": ["anys"], "tocat": ["tocats"], "a": ["al", "als"] } for syn in synonims: for word in synonims[syn]: s = s.replace(" " + word + " ", " " + syn + " ") # remove final space if s[-1] == " ": s = s[:-1] return s def date_found(): return found or \ ( datestr != "" or timeStr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate today = dateNow.strftime("%w") currentYear = 
dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" words = clean_string(text).split(" ") timeQualifiersList = ['matí', 'tarda', 'nit'] time_indicators = ["em", "a", "a les", "cap a", "vora", "després", "estas", "no", "dia", "hora"] days = ['dilluns', 'dimarts', 'dimecres', 'dijous', 'divendres', 'dissabte', 'diumenge'] months = ['gener', 'febrer', 'març', 'abril', 'maig', 'juny', 'juliol', 'agost', 'setembre', 'octubre', 'novembre', 'desembre'] monthsShort = ['gen', 'feb', 'març', 'abr', 'maig', 'juny', 'jul', 'ag', 'set', 'oct', 'nov', 'des'] nexts = ["pròxim", "pròxima", "vinent"] suffix_nexts = ["següent", "després"] lasts = ["últim", "última", "darrer", "darrera", "passat", "passada"] suffix_lasts = ["passada", "passat", "anterior", "abans"] nxts = ["passat", "després", "segueix", "seguit", "seguida", "següent", "pròxim", "pròxima"] prevs = ["abans", "prèvia", "previamente", "anterior"] froms = ["partir", "dins", "des", "a", "després", "pròxima", "pròxim", "del", "de"] thises = ["aquest", "aquesta", "aqueix", "aqueixa", "este", "esta"] froms += thises lists = nxts + prevs + froms + time_indicators for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" start = idx used = 0 # save timequalifier for later if word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, yesterday elif word == "avui" and not fromFlag: dayOffset = 0 used += 1 elif word == "demà" and not fromFlag: dayOffset += 1 used += 1 elif word == "ahir" and not fromFlag: dayOffset -= 1 used += 1 # "before yesterday" and "before before yesterday" elif (word == "abansahir") and not fromFlag: dayOffset -= 2 used += 1 elif word == "abansabansahir" and not fromFlag: 
dayOffset -= 3 used += 1 # day after tomorrow and after after tomorrow elif word == "demàpassat" and not fromFlag: dayOffset += 2 used = 1 elif word == "demàpassatpassat" and not fromFlag: dayOffset += 3 used = 1 # parse 5 days, 10 weeks, last week, next week, week after elif word == "dia": if wordNext == "després" or wordNext == "abans": used += 1 if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used += 1 elif (wordPrev and wordPrev[0].isdigit() and wordNext not in months and wordNext not in monthsShort): dayOffset += int(wordPrev) start -= 1 used += 2 elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ months and wordNextNext not in monthsShort: dayOffset += int(wordNext) start -= 1 used += 2 elif word == "setmana" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 for w in nexts: if wordPrev == w: dayOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: dayOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: dayOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "mes" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: monthOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: monthOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: monthOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: monthOffset = -7 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "any" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: yearOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: yearOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: yearOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext 
== w: yearOffset = -7 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 for w in nexts: if wordPrev == w: dayOffset += 7 used += 1 start -= 1 for w in lasts: if wordPrev == w: dayOffset -= 7 used += 1 start -= 1 for w in suffix_nexts: if wordNext == w: dayOffset += 7 used += 1 start -= 1 for w in suffix_lasts: if wordNext == w: dayOffset -= 7 used += 1 start -= 1 if wordNext == "feira": used += 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and wordPrev[0].isdigit(): # 13 maig datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): # maig 13 datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False elif wordPrevPrev and wordPrevPrev[0].isdigit(): # 13 dia maig datestr += " " + wordPrevPrev start -= 2 used += 2 if wordNext and word[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNextNext and wordNextNext[0].isdigit(): # maig dia 13 datestr += " " + wordNextNext used += 2 if wordNextNextNext and wordNextNextNext[0].isdigit(): datestr += " " + wordNextNextNext used += 1 hasYear = True else: hasYear = False if datestr in months: datestr = "" # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("avui") validFollowups.append("demà") validFollowups.append("ahir") validFollowups.append("abansahir") validFollowups.append("abansabansahir") 
validFollowups.append("demàpassat") validFollowups.append("ara") validFollowups.append("ja") validFollowups.append("abans") # TODO debug word "passat" that one is failing for some reason if word in froms and wordNext in validFollowups: if not (wordNext == "demà" and wordNext == "ahir") and not ( word == "passat" or word == "abans" or word == "em"): used = 2 fromFlag = True if wordNext == "demà": dayOffset += 1 elif wordNext == "ahir": dayOffset -= 1 elif wordNext == "abansahir": dayOffset -= 2 elif wordNext == "abansabansahir": dayOffset -= 3 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if wordNextNext == "dia": used += 1 if tmpOffset < 0: tmpOffset += 7 if wordNextNext: if wordNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNextNextNext: if wordNextNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset if wordNextNextNext == "dia": used += 1 if wordNext in months: used -= 1 if used > 0: if start - 1 > 0 and words[start - 1] in lists: start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in lists: words[start - 1] = "" found = True daySpecified = True # parse time timeStr = "" hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None military = False for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "migdia": hrAbs = 12 used += 1 elif word == 
"mijanit": hrAbs = 0 used += 1 elif word == "matí": if not hrAbs: hrAbs = 8 used += 1 elif word == "tarda": if not hrAbs: hrAbs = 15 used += 1 elif word == "mitja" and wordNext == "tarda": if not hrAbs: hrAbs = 17 used += 2 elif word == "mig" and wordNext == "matí": if not hrAbs: hrAbs = 10 used += 2 elif word == "vespre" or (word == "final" and wordNext == "tarda"): if not hrAbs: hrAbs = 19 used += 2 elif word == "final" and wordNext == "matí": if not hrAbs: hrAbs = 11 used += 2 elif word == "matinada": if not hrAbs: hrAbs = 4 used += 1 elif word == "nit": if not hrAbs: hrAbs = 22 used += 1 # parse half an hour, quarter hour elif word == "hora" and \ (wordPrev in time_indicators or wordPrevPrev in time_indicators): if wordPrev == "mitja": minOffset = 30 elif wordPrev == "quart": minOffset = 15 elif wordPrevPrev == "quart": minOffset = 15 if idx > 2 and words[idx - 3] in time_indicators: words[idx - 3] = "" words[idx - 2] = "" else: hrOffset = 1 if wordPrevPrev in time_indicators: words[idx - 2] = "" words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif wordNext == "matí": remainder = "am" used += 1 elif (wordNext == "tarda" or wordNext == "vespre"): remainder = "pm" used += 1 elif wordNext == "nit": if 0 < int(word[0]) < 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext in thises and wordNextNext == "matí": remainder = "am" used = 2 elif wordNext in thises and 
(wordNextNext == "tarda" or wordNextNext == "vespre"): remainder = "pm" used = 2 elif wordNext in thises and wordNextNext == "nit": remainder = "pm" used = 2 else: if timeQualifier != "": military = True if strHH <= 12 and \ (timeQualifier == "matí" or timeQualifier == "tarda"): strHH += 12 else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if (wordNext == "pm" or wordNext == "p.m." or wordNext == "tarda" or wordNext == "vespre"): strHH = strNum remainder = "pm" used = 1 elif (wordNext == "am" or wordNext == "a.m." 
or wordNext == "matí"): strHH = strNum remainder = "am" used = 1 elif (int(word) > 100 and ( wordPrev == "o" or wordPrev == "oh" or wordPrev == "zero" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 military = True if wordNext == "hora": used += 1 elif ( wordNext == "hora" and word[0] != '0' and ( int(word) < 100 and int(word) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minut": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "segon": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 military = True if wordNext == "hora": used += 1 elif wordNext == "" or ( wordNext == "en" and wordNextNext == "punt"): strHH = word strMM = 00 if wordNext == "en" and wordNextNext == "punt": used += 2 if (wordNextNextNext == "tarda" or wordNextNextNext == "vespre"): remainder = "pm" used += 1 elif wordNextNextNext == "matí": remainder = "am" used += 1 elif wordNextNextNext == "nit": if 0 > int(strHH) > 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext[0].isdigit(): strHH = word strMM = wordNext military = True used += 1 if wordNextNext == "hora": used += 1 else: isTime = False strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if (remainder == "pm" and 0 < strHH < 12) else strHH strHH = strHH - 12 if (remainder == "am" and 0 < strHH >= 12) else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "en" or wordPrev == "punt": words[words.index(wordPrev)] = "" if idx > 0 and wordPrev in time_indicators: words[idx - 1] = "" if idx > 1 and wordPrevPrev in 
time_indicators: words[idx - 2] = "" idx += used - 1 found = True # check that we found a date if not date_found: return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": en_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] for idx, en_month in enumerate(en_months): datestr = datestr.replace(months[idx], en_month) for idx, en_month in enumerate(en_monthsShort): datestr = datestr.replace(monthsShort[idx], en_month) temp = datetime.strptime(datestr, "%B %d") if extractedDate.tzinfo: temp = temp.replace(tzinfo=extractedDate.tzinfo) if not hasYear: temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if timeStr != "": temp = datetime(timeStr) extractedDate = extractedDate.replace(hour=temp.strftime("%H"), minute=temp.strftime("%M"), second=temp.strftime("%S")) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if (hrAbs or 0) != -1 and (minAbs or 0) != -1: if hrAbs is None and minAbs is None and default_time: hrAbs = default_time.hour minAbs = default_time.minute extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs 
or 0) if (hrAbs or minAbs) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + relativedelta(hours=hrOffset) if minOffset != 0: extractedDate = extractedDate + relativedelta(minutes=minOffset) if secOffset != 0: extractedDate = extractedDate + relativedelta(seconds=secOffset) resultStr = " ".join(words) resultStr = ' '.join(resultStr.split()) resultStr = _ca_pruning(resultStr) return [extractedDate, resultStr] def _ca_pruning(text, symbols=True, accents=False, agressive=True): # agressive ca word pruning words = ["l", "la", "el", "els", "les", "de", "dels", "ell", "ells", "me", "és", "som", "al", "a", "dins", "per", "aquest", "aquesta", "això", "aixina", "en", "aquell", "aquella", "va", "vam", "vaig", "quin", "quina"] if symbols: symbols = [".", ",", ";", ":", "!", "?", "¡", "¿"] for symbol in symbols: text = text.replace(symbol, "") text = text.replace("'", " ").replace("_", " ") # accents=False if accents: accents = {"a": ["á", "à", "ã", "â"], "e": ["ê", "è", "é"], "i": ["í", "ï"], "o": ["ò", "ó"], "u": ["ú", "ü"], "c": ["ç"], "ll": ["l·l"], "n": ["ñ"]} for char in accents: for acc in accents[char]: text = text.replace(acc, char) if agressive: text_words = text.split(" ") for idx, word in enumerate(text_words): if word in words: text_words[idx] = "" text = " ".join(text_words) text = ' '.join(text.split()) return text def get_gender_ca(word, context=""): """ Guess the gender of a word Some languages assign genders to specific words. This method will attempt to determine the gender, optionally using the provided context sentence. Args: word (str): The word to look up context (str, optional): String containing word, for context Returns: str: The code "m" (male), "f" (female) or "n" (neutral) for the gender, or None if unknown/or unused in the given language. 
""" # parse gender taking context into account word = word.lower() words = context.lower().split(" ") for idx, w in enumerate(words): if w == word and idx != 0: # in Catalan usually the previous word (a determinant) # assigns gender to the next word previous = words[idx - 1].lower() if previous in _MALE_DETERMINANTS_CA: return "m" elif previous in _FEMALE_DETERMINANTS_CA: return "f" # get gender using only the individual word # see if this word has the gender defined if word in _GENDERS_CA: return _GENDERS_CA[word] singular = word.rstrip("s") if singular in _GENDERS_CA: return _GENDERS_CA[singular] # in Catalan the last vowel usually dosn't defines the gender of a word # the gender of the determinant takes precedence over this rule for end_str in _FEMALE_ENDINGS_CA: if word.endswith(end_str): return "f" for end_str in _MALE_ENDINGS_CA: if word.endswith(end_str): return "m" return None lingua-franca-release-v0.4.3/lingua_franca/lang/parse_common.py000066400000000000000000000301771426211343400246070ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class Normalizer:
    """
    Config-driven utterance normalizer.

    Individual languages may subclass this if needed; normalize_XX should
    pass a valid config (read from json). Every option is looked up in
    ``self.config`` on access and falls back to the default coded in the
    matching property.
    """
    _default_config = {}

    def __init__(self, config=None):
        # a missing or empty config falls back to the class-level default
        self.config = config or self._default_config

    @staticmethod
    def tokenize(utterance):
        """
        Split an utterance on whitespace, after detaching "%" from a
        preceding number ("12%" -> "12 %") and "#" from a following
        number ("#1" -> "# 1").

        Args:
            utterance (str): text to split
        Returns:
            [str]: the tokens
        """
        spaced = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
        spaced = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", spaced)
        return spaced.split()

    def _rewrite_tokens(self, utterance, convert):
        """ Tokenize, map every token through `convert`, join on " " """
        return " ".join([convert(token)
                         for token in self.tokenize(utterance)])

    # ---- boolean switches -------------------------------------------------
    @property
    def should_lowercase(self):
        return self.config.get("lowercase", False)

    @property
    def should_numbers_to_digits(self):
        return self.config.get("numbers_to_digits", True)

    @property
    def should_expand_contractions(self):
        return self.config.get("expand_contractions", True)

    @property
    def should_remove_symbols(self):
        return self.config.get("remove_symbols", False)

    @property
    def should_remove_accents(self):
        return self.config.get("remove_accents", False)

    @property
    def should_remove_articles(self):
        return self.config.get("remove_articles", False)

    @property
    def should_remove_stopwords(self):
        return self.config.get("remove_stopwords", False)

    # ---- lookup tables ----------------------------------------------------
    @property
    def contractions(self):
        return self.config.get("contractions", {})

    @property
    def word_replacements(self):
        return self.config.get("word_replacements", {})

    @property
    def number_replacements(self):
        return self.config.get("number_replacements", {})

    @property
    def accents(self):
        # default table: accented vowel -> bare vowel, both cases
        fallback = {
            "á": "a", "à": "a", "ã": "a", "â": "a",
            "é": "e", "è": "e", "ê": "e", "ẽ": "e",
            "í": "i", "ì": "i", "î": "i", "ĩ": "i",
            "ò": "o", "ó": "o", "ô": "o", "õ": "o",
            "ú": "u", "ù": "u", "û": "u", "ũ": "u",
            "Á": "A", "À": "A", "Ã": "A", "Â": "A",
            "É": "E", "È": "E", "Ê": "E", "Ẽ": "E",
            "Í": "I", "Ì": "I", "Î": "I", "Ĩ": "I",
            "Ò": "O", "Ó": "O", "Ô": "O", "Õ": "O",
            "Ú": "U", "Ù": "U", "Û": "U", "Ũ": "U",
        }
        return self.config.get("accents", fallback)

    @property
    def stopwords(self):
        return self.config.get("stopwords", [])

    @property
    def articles(self):
        return self.config.get("articles", [])

    @property
    def symbols(self):
        fallback = [";", "_", "!", "?", "<", ">",
                    "|", "(", ")", "=", "[", "]", "{",
                    "}", "»", "«", "*", "~", "^", "`"]
        return self.config.get("symbols", fallback)

    # ---- single transformations ------------------------------------------
    def expand_contractions(self, utterance):
        """ Expand common contractions, e.g. "isn't" -> "is not" """
        table = self.contractions
        return self._rewrite_tokens(
            utterance, lambda tok: table[tok] if tok in table else tok)

    def numbers_to_digits(self, utterance):
        """ Swap tokens found in `number_replacements` for their value """
        table = self.number_replacements
        return self._rewrite_tokens(
            utterance, lambda tok: table[tok] if tok in table else tok)

    def remove_articles(self, utterance):
        """ Blank out article tokens (the blanks stay as extra spaces) """
        unwanted = self.articles
        return self._rewrite_tokens(
            utterance, lambda tok: "" if tok in unwanted else tok)

    def remove_stopwords(self, utterance):
        """ Blank out stopword tokens and strip an orphaned final hyphen """
        unwanted = self.stopwords
        blanked = self._rewrite_tokens(
            utterance, lambda tok: "" if tok in unwanted else tok)
        # Remove an orphaned hyphen (and any spaces after it) at the end
        # of the utterance; more characters may be added later
        return re.sub(r'- *$', '', blanked)

    def remove_symbols(self, utterance):
        """ Replace every configured symbol by a single space """
        for mark in self.symbols:
            utterance = utterance.replace(mark, " ")
        return utterance

    def remove_accents(self, utterance):
        """ Replace every configured accented character by its mapping """
        table = self.accents
        for accented in table:
            utterance = utterance.replace(accented, table[accented])
        return utterance

    def replace_words(self, utterance):
        """ Swap tokens found in `word_replacements` for their value """
        table = self.word_replacements
        return self._rewrite_tokens(
            utterance, lambda tok: table[tok] if tok in table else tok)

    # ---- pipeline ---------------------------------------------------------
    def normalize(self, utterance="", remove_articles=None):
        """
        Run the configured pipeline: mutations first (lowercase,
        contractions, numbers, word replacements), then removals
        (symbols, accents, articles, stopwords), then whitespace cleanup.

        Args:
            utterance (str): text to normalize
            remove_articles (bool): a truthy value forces article removal;
                None/False defers to the "remove_articles" config option.
                TODO deprecate this param, kept for backwards compat
        Returns:
            str: normalized text with single spaces between words
        """
        # mutations
        if self.should_lowercase:
            utterance = utterance.lower()
        if self.should_expand_contractions:
            utterance = self.expand_contractions(utterance)
        if self.should_numbers_to_digits:
            utterance = self.numbers_to_digits(utterance)
        # word replacement is unconditional
        utterance = self.replace_words(utterance)

        # removals
        if self.should_remove_symbols:
            utterance = self.remove_symbols(utterance)
        if self.should_remove_accents:
            utterance = self.remove_accents(utterance)
        if remove_articles or self.should_remove_articles:
            utterance = self.remove_articles(utterance)
        if self.should_remove_stopwords:
            utterance = self.remove_stopwords(utterance)

        # remove extra spaces
        return " ".join(filter(None, utterance.split(" ")))
def partition_list(items, split_on):
    """
    Cut a list into runs, using matching items as separators.
    Works similarly to str.partition, except that every separator is
    kept as its own one-element list and empty runs are left out.

    Args:
        items: iterable of items to partition
        split_on callable:
            Called with each item in succession; an item for which it
            returns a true value becomes a separator.

    Returns:
        [[any]]: the non-empty runs and single-item separators, in order
    """
    groups = []
    pending = []
    for element in items:
        if not split_on(element):
            pending.append(element)
            continue
        # separator: flush the run collected so far, then the separator
        if pending:
            groups.append(pending)
            pending = []
        groups.append([element])
    if pending:
        groups.append(pending)
    return groups
def look_for_fractions(split_list):
    """
    Decide whether the pieces of a string split on '/' form a fraction.

    A fraction is exactly two pieces that both parse with float(), with a
    denominator different from zero. Callers evaluate
    float(pieces[0]) / float(pieces[1]) straight after a True result, so
    accepting "1/0" here would make them raise ZeroDivisionError.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): False if not a fraction, otherwise True
    """
    if len(split_list) != 2:
        return False
    try:
        float(split_list[0])
        denominator = float(split_list[1])
    except (TypeError, ValueError):
        # TypeError covers non-string pieces such as None
        return False
    return denominator != 0
third=3 instead of 1/3 Returns: list: list of extracted numbers as floats """ numbers = [] normalized = text extract = extract_handler(normalized, short_scale, ordinals) to_parse = normalized while extract: numbers.append(extract) prev = to_parse num_txt = pronounce_handler(extract) extract = str(extract) if extract.endswith(".0"): extract = extract[:-2] # handle duplicate occurences, replace last one only def replace_right(source, target, replacement, replacements=None): return replacement.join(source.rsplit(target, replacements)) normalized = replace_right(normalized, num_txt, extract, 1) # last biggest number was replaced, recurse to handle cases like # test one two 3 to_parse = replace_right(to_parse, num_txt, extract, 1) to_parse = replace_right(to_parse, extract, " ", 1) if to_parse == prev: # avoid infinite loops, occasionally pronounced number may be # different from extracted text, # ie pronounce(0.5) != half and extract(half) == 0.5 extract = False # TODO fix this else: extract = extract_handler(to_parse, short_scale, ordinals) numbers.reverse() return numbers lingua-franca-release-v0.4.3/lingua_franca/lang/parse_cs.py000066400000000000000000001750031426211343400237220ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
def generate_plurals_cs(originals):
    """
    Return a new set or dict with the suffix 'ý' appended to every word.

    For a dict the suffix goes on each key and the values are kept; for
    any other iterable of strings a set of the suffixed words is returned.
    NOTE(review): the name says "plurals", but the code only appends 'ý';
    confirm that this is the intended Czech form.

    Args:
        originals set(str) or dict(str, any): words to extend

    Returns:
        set(str) or dict(str, any)
    """
    if not isinstance(originals, dict):
        return {word + "ý" for word in originals}

    suffixed = {}
    for word, payload in originals.items():
        suffixed[word + 'ý'] = payload
    return suffixed
def _convert_words_to_numbers_cs(text, short_scale=True, ordinals=False):
    """
    Replace the number words of a string by their numeric value.

    The text is lowercased and tokenized; each run of tokens that
    _extract_numbers_with_text_cs recognises as one number is replaced by
    str(value), all other tokens are copied through.

    Args:
        text str:
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third)
                          should be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The lowercased text, with numbers subbed in where appropriate.

    """
    tokens = tokenize(text.lower())
    found = sorted(
        _extract_numbers_with_text_cs(tokens, short_scale, ordinals),
        key=lambda number: number.start_index)

    pieces = []
    cursor = 0  # position in `found` of the next number to emit
    for token in tokens:
        upcoming = found[cursor] if cursor < len(found) else None
        if upcoming is None or token.index < upcoming.start_index:
            # ordinary word ahead of the next number (or no number left)
            pieces.append(token.word)
            continue
        if token.index == upcoming.start_index:
            # first token of the number: emit the value once
            pieces.append(str(upcoming.value))
        if token.index == upcoming.end_index:
            # last token of the number: move on to the next one
            cursor += 1
        # tokens inside the number's span are dropped

    return ' '.join(pieces)
def _extract_number_with_text_cs_helper(tokens,
                                        short_scale=True, ordinals=False,
                                        fractional_numbers=True):
    """
    Helper for _extract_number_with_text_cs.

    When fractional_numbers is set, the fraction extractor and then the
    decimal extractor are tried, and the first truthy value wins.
    Otherwise, or when neither finds anything, the whole-number extractor
    provides the result.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:
        fractional_numbers boolean:

    Returns:
        int or float, [Tokens]

    """
    if fractional_numbers:
        for extractor in (_extract_fraction_with_text_cs,
                          _extract_decimal_with_text_cs):
            value, matched_tokens = extractor(tokens, short_scale, ordinals)
            if value:
                return value, matched_tokens

    return _extract_whole_number_with_text_cs(tokens, short_scale, ordinals)
def _extract_decimal_with_text_cs(tokens, short_scale, ordinals):
    """
    Extract decimal numbers from a string.

    This function handles text such as '2 point 5' (whole part, a word
    from _DECIMAL_MARKER, fractional part).

    Notes:
        While this is a helper for extract_number_xx, it also depends on
        extract_number_xx, to parse out the components of the decimal.

        This does not currently handle things like:
            number dot number number number

        The fractional digits come from the literal token text when it is
        purely decimal digits, so leading zeros survive ("05" -> .05).
        Otherwise they come from the parsed value. A fractional part whose
        value is not a plain non-negative integer (e.g. 0.5) is rejected
        rather than passed to float() as a malformed string like '0.0.5'.

    Args:
        tokens [Token]: The text to parse.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (float, [Token])
        The value found and relevant tokens.
        (None, None) if no decimal value is found.

    """
    for c in _DECIMAL_MARKER:
        partitions = partition_list(tokens, lambda t: t.word == c)

        # exactly: [whole part tokens], [marker], [fractional part tokens]
        if len(partitions) == 3:
            numbers1 = \
                _extract_numbers_with_text_cs(partitions[0], short_scale,
                                              ordinals,
                                              fractional_numbers=False)
            numbers2 = \
                _extract_numbers_with_text_cs(partitions[2], short_scale,
                                              ordinals,
                                              fractional_numbers=False)

            if not numbers1 or not numbers2:
                return None, None

            # number closest to the marker on each side
            number = numbers1[-1]
            decimal = numbers2[0]

            decimal_text = str(decimal.text)
            # TODO handle number dot number number number
            if "." in decimal_text:
                continue

            # prefer the literal digits so that "05" is not read as "5"
            if decimal_text.isdecimal():
                digits = decimal_text
            else:
                digits = str(decimal.value)
            if not digits.isdecimal():
                # float or negative value: cannot follow "0."
                continue

            return number.value + float('0.' + digits), \
                number.tokens + partitions[1] + decimal.tokens
    return None, None
if not ordinals: word = _text_cs_inflection_normalize(word, 1) if word not in string_num_scale and \ word not in _STRING_NUM_CS and \ word not in _SUMS and \ word not in multiplies and \ not (ordinals and word in string_num_ordinal) and \ not is_numeric(word) and \ not isFractional_cs(word, short_scale=short_scale) and \ not look_for_fractions(word.split('/')): words_only = [token.word for token in number_words] # if number_words and not all([w in _ARTICLES_CS | # _NEGATIVES for w in words_only]): if number_words and not all([w in _NEGATIVES for w in words_only]): break else: number_words = [] continue elif word not in multiplies \ and prev_word not in multiplies \ and prev_word not in _SUMS \ and not (ordinals and prev_word in string_num_ordinal) \ and prev_word not in _NEGATIVES: # \ # and prev_word not in _ARTICLES_CS: number_words = [token] elif prev_word in _SUMS and word in _SUMS: number_words = [token] else: number_words.append(token) # is this word already a number ? if is_numeric(word): if word.isdigit(): # doesn't work with decimals val = int(word) else: val = float(word) current_val = val # is this word the name of a number ? if word in _STRING_NUM_CS: val = _STRING_NUM_CS.get(word) current_val = val elif word in string_num_scale: val = string_num_scale.get(word) current_val = val elif ordinals and word in string_num_ordinal: val = string_num_ordinal[word] current_val = val # is the prev word an ordinal number and current word is one? # second one, third one if ordinals and prev_word in string_num_ordinal and val == 1: val = prev_val # is the prev word a number and should we sum it? 
# twenty two, fifty six if (prev_word in _SUMS and val and val < 10) or all([prev_word in multiplies, val < prev_val if prev_val else False]): val = prev_val + val # For Czech only: If Ordinal previous number will be also in ordinal number format # dvacátý první = twentieth first if (prev_word in string_num_ordinal and val and val < 10) or all([prev_word in multiplies, val < prev_val if prev_val else False]): val = prev_val + val # is the prev word a number and should we multiply it? # twenty hundred, six hundred if word in multiplies: if not prev_val: prev_val = 1 val = prev_val * val # is this a spoken fraction? # half cup if val is False: val = isFractional_cs(word, short_scale=short_scale) current_val = val # 2 fifths if not ordinals: next_val = isFractional_cs(next_word, short_scale=short_scale) if next_val: if not val: val = 1 val = val * next_val number_words.append(tokens[idx + 1]) # is this a negative number? if val and prev_word and prev_word in _NEGATIVES: val = 0 - val # let's make sure it isn't a fraction if not val: # look for fractions like "2/3" aPieces = word.split('/') if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) current_val = val else: if all([ prev_word in _SUMS, word not in _SUMS, word not in multiplies, current_val >= 10]): # Backtrack - we've got numbers we can't sum. number_words.pop() val = prev_val break prev_val = val if word in multiplies and next_word not in multiplies: # handle long numbers # six hundred sixty six # two million five hundred thousand # # This logic is somewhat complex, and warrants # extensive documentation for the next coder's sake. # # The current word is a power of ten. `current_val` is # its integer value. `val` is our working sum # (above, when `current_val` is 1 million, `val` is # 2 million.) # # We have a dict `string_num_scale` containing [value, word] # pairs for "all" powers of ten: string_num_scale[10] == "ten. 
# # We need go over the rest of the tokens, looking for other # powers of ten. If we find one, we compare it with the current # value, to see if it's smaller than the current power of ten. # # Numbers which are not powers of ten will be passed over. # # If all the remaining powers of ten are smaller than our # current value, we can set the current value aside for later, # and begin extracting another portion of our final result. # For example, suppose we have the following string. # The current word is "million".`val` is 9000000. # `current_val` is 1000000. # # "nine **million** nine *hundred* seven **thousand** # six *hundred* fifty seven" # # Iterating over the rest of the string, the current # value is larger than all remaining powers of ten. # # The if statement passes, and nine million (9000000) # is appended to `to_sum`. # # The main variables are reset, and the main loop begins # assembling another number, which will also be appended # under the same conditions. # # By the end of the main loop, to_sum will be a list of each # "place" from 100 up: [9000000, 907000, 600] # # The final three digits will be added to the sum of that list # at the end of the main loop, to produce the extracted number: # # sum([9000000, 907000, 600]) + 57 # == 9,000,000 + 907,000 + 600 + 57 # == 9,907,657 # # >>> foo = "nine million nine hundred seven thousand six # hundred fifty seven" # >>> extract_number(foo) # 9907657 time_to_sum = True for other_token in tokens[idx+1:]: if other_token.word in multiplies: if string_num_scale[other_token.word] >= current_val: time_to_sum = False else: continue if not time_to_sum: break if time_to_sum: to_sum.append(val) val = 0 prev_val = 0 if val is not None and to_sum: val += sum(to_sum) return val, number_words def _initialize_number_data(short_scale): """ Generate dictionaries of words to numbers, based on scale. This is a helper function for _extract_whole_number. 
def extract_duration_cs(text):
    """
    Convert a Czech phrase into a duration.

    Number words are first converted to digits (which also lowercases the
    text). Then every "<number> <unit>" pair whose unit is a key of
    _TIME_UNITS_CONVERSION is removed from the text and added to the
    matching timedelta field. As an example from the English
    counterpart, "set a timer for 5 minutes" would yield a 5 minute
    timedelta and "set a timer for".

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
        None is returned instead of a tuple when `text` is empty; this is
        kept for backwards compatibility with existing callers.
    """
    if not text:
        return None

    # Czech inflection for time: minuta,minuty,minut - safe to use minut as pattern
    # For day: den, dny, dnů - short pattern not applicable, list all

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # The number is captured in the named group "value". A "(?P" opener
    # without a "<name>" is rejected by re, so the name must be present.
    # {unit} is filled per unit; "[ay]?" accepts an inflected ending.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ay]?"
    text = _convert_words_to_numbers_cs(text)

    for (unit_cs, unit_en) in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_cs)

        def repl(match):
            # used immediately by re.sub below, so the loop variable
            # unit_en still has this iteration's value
            time_units[unit_en] += float(match.group("value"))
            return ''
        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(
        **time_units) if any(time_units.values()) else None

    return (duration, text)
# Normalize czech inflection s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace("dvoje", "2").replace("dvojice", "2") \ .replace("dnes večer", "večer").replace("dnes v noci", "noci") # \ # .replace("tento večer", "večer") # .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ # .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ # .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ # .replace("decades", "decade") \ # .replace("tisíciletí", "milénium") # .replace("oclock", "o'clock") wordList = s.split() for idx, word in enumerate(wordList): #word = word.replace("'s", "") ########## # Czech Day Ordinals - we do not use 1st,2nd format # instead we use full ordinal number names with specific format(suffix) # Example: třicátého prvního > 31 count_ordinals = 0 if word == "prvního": count_ordinals = 1 # These two have different format elif word == "třetího": count_ordinals = 3 elif word.endswith("ého"): tmp = word[:-3] tmp += ("ý") for nr, name in _ORDINAL_BASE_CS.items(): if name == tmp: count_ordinals = nr # If number is bigger than 19 chceck if next word is also ordinal # and count them together if count_ordinals > 19: if wordList[idx+1] == "prvního": count_ordinals += 1 # These two have different format elif wordList[idx+1] == "třetího": count_ordinals += 3 elif wordList[idx+1].endswith("ého"): tmp = wordList[idx+1][:-3] tmp += ("ý") for nr, name in _ORDINAL_BASE_CS.items(): if name == tmp and nr < 10: # write only if sum makes acceptable count of days in month if (count_ordinals + nr) <= 31: count_ordinals += nr if count_ordinals > 0: word = str(count_ordinals) # Write normalized valu into word if count_ordinals > 20: # If counted number is grather than 20, clear next word so it is not used again wordList[idx+1] = "" ########## # Remove inflection from czech months wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset 
is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 today = anchorDate.strftime("%w") currentYear = anchorDate.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersAM = ['ráno', 'dopoledne'] timeQualifiersPM = ['odpoledne', 'večer', 'noc', 'noci'] timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) markers = ['na', 'v', 'do', 'na', 'tento', 'okolo', 'toto', 'během', 'za', 'této'] days = ['pondělí', 'úterý', 'středa', 'čtvrtek', 'pátek', 'sobota', 'neděle'] months = _MONTHS_CZECH recur_markers = days + [d + 'ho' for d in days] + \ ['víkend', 'všední'] # Check this monthsShort = ['led', 'úno', 'bře', 'dub', 'kvě', 'čvn', 'čvc', 'srp', 'zář', 'říj', 'lis', 'pro'] year_multiples = ["desetiletí", "století", "tisíciletí"] day_multiples = ["týden", "měsíc", "rok"] words = clean_string(text) for idx, word in enumerate(words): if word == "": continue word = _text_cs_inflection_normalize(word, 2) wordPrevPrev = _text_cs_inflection_normalize( words[idx - 2], 2) if idx > 1 else "" wordPrev = _text_cs_inflection_normalize( words[idx - 1], 2) if idx > 0 else "" wordNext = _text_cs_inflection_normalize( words[idx + 1], 2) if idx + 1 < len(words) else "" wordNextNext = _text_cs_inflection_normalize( words[idx + 2], 2) if idx + 2 < len(words) else "" # this isn't in clean string because I don't want to save back to words #word = word.rstrip('s') start = idx used = 0 # save timequalifier for later # if word == "před" and dayOffset: # dayOffset = - dayOffset # used += 1 if word == "nyní" and not datestr: resultStr = " ".join(words[idx + 1:]) resultStr = ' '.join(resultStr.split()) extractedDate = anchorDate.replace(microsecond=0) return [extractedDate, resultStr] elif wordNext in year_multiples: multiplier = None if is_numeric(word): multiplier = 
extract_number_cs(word) multiplier = multiplier or 1 multiplier = int(multiplier) used += 2 if wordNext == "desetiletí": yearOffset = multiplier * 10 elif wordNext == "století": yearOffset = multiplier * 100 elif wordNext == "tisíciletí": yearOffset = multiplier * 1000 # couple of elif word == "2" and wordNext == "krát" and \ wordNextNext in year_multiples: multiplier = 2 used += 3 if wordNextNext == "desetiletí": yearOffset = multiplier * 10 elif wordNextNext == "století": yearOffset = multiplier * 100 elif wordNextNext == "tisíciletí": yearOffset = multiplier * 1000 elif word == "2" and wordNext == "krát" and \ wordNextNext in day_multiples: multiplier = 2 used += 3 if wordNextNext == "rok": yearOffset = multiplier elif wordNextNext == "měsíc": monthOffset = multiplier elif wordNextNext == "týden": dayOffset = multiplier * 7 elif word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "dnes" and not fromFlag: dayOffset = 0 used += 1 elif word == "zítra" and not fromFlag: dayOffset = 1 used += 1 elif word == "den" and wordNext == "před" and wordNextNext == "včera" and not fromFlag: dayOffset = -2 used += 3 elif word == "před" and wordNext == "včera" and not fromFlag: dayOffset = -2 used += 2 elif word == "včera" and not fromFlag: dayOffset = -1 used += 1 elif (word == "den" and wordNext == "po" and wordNextNext == "zítra" and not fromFlag and (not wordPrev or not wordPrev[0].isdigit())): dayOffset = 2 used = 3 if wordPrev == "ten": start -= 1 used += 1 # parse 5 days, 10 weeks, last week, next week elif word == "den": if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 if wordPrevPrev == "před": dayOffset = -dayOffset used += 1 start -= 1 elif word == "týden" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev == "další" or wordPrev == "příští": dayOffset = 7 start -= 1 used = 2 elif wordPrev == "poslední": 
dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "měsíc" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "další" or wordPrev == "příští": monthOffset = 1 start -= 1 used = 2 elif wordPrev == "poslední": monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "rok" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "další" or wordPrev == "příští": yearOffset = 1 start -= 1 used = 2 elif wordPrev == "poslední": yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "další" or wordPrev == "příští": if dayOffset <= 2: dayOffset += 7 used += 1 start -= 1 elif wordPrev == "poslední": dayOffset -= 7 used += 1 start -= 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort and not fromFlag: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 # Convert czech months to english datestr = _MONTHS_CONVERSION.get(m) if wordPrev and (wordPrev[0].isdigit() or (wordPrev == " " and wordPrevPrev[0].isdigit())): if wordPrev == " " and wordPrevPrev[0].isdigit(): datestr += " " + words[idx - 2] used += 1 start -= 1 else: datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # if no date indicators found, it may not be the month of May # may "i/we" ... # "... 
may be" # elif word == 'may' and wordNext in ['i', 'we', 'be']: # datestr = "" # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("dnes") validFollowups.append("zítra") validFollowups.append("včera") validFollowups.append("další") validFollowups.append("příští") validFollowups.append("poslední") validFollowups.append("teď") validFollowups.append("toto") validFollowups.append("této") validFollowups.append("tento") if (word == "od" or word == "po" or word == "do") and wordNext in validFollowups: used = 2 fromFlag = True if wordNext == "zítra": dayOffset += 1 elif wordNext == "včera": dayOffset -= 1 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext == "další" or wordPrev == "příští": if dayOffset <= 2: tmpOffset += 7 used += 1 start -= 1 elif wordNext == "poslední": tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and (words[start - 1] == "toto" or words[start - 1] == "této" or words[start - 1] == "tento"): start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None military = False for idx, word in enumerate(words): if word == "": continue word = _text_cs_inflection_normalize(word, 2) wordPrevPrev = _text_cs_inflection_normalize( words[idx - 2], 2) if idx > 1 else "" wordPrev = _text_cs_inflection_normalize( words[idx - 1], 2) if idx > 0 else "" wordNext = _text_cs_inflection_normalize( words[idx + 1], 2) if idx + 1 < len(words) else "" wordNextNext = _text_cs_inflection_normalize( words[idx + 2], 2) if idx + 2 < len(words) 
else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "poledne": hrAbs = 12 used += 1 elif word == "půlnoc": hrAbs = 0 used += 1 elif word == "ráno": if hrAbs is None: hrAbs = 8 used += 1 elif word == "odpoledne": if hrAbs is None: hrAbs = 15 used += 1 elif word == "večer": if hrAbs is None: hrAbs = 19 used += 1 if (wordNext != "" and wordNext[0].isdigit() and ":" in wordNext): used -= 1 elif word == "noci" or word == "noc": if hrAbs is None: hrAbs = 22 #used += 1 # if ((wordNext !='' and not wordNext[0].isdigit()) or wordNext =='') and \ # ((wordNextNext !='' and not wordNextNext[0].isdigit())or wordNextNext =='') : # used += 1 # used += 1 ## NOTE this breaks other tests, TODO refactor me! # couple of time_unit elif word == "2" and wordNext == "krát" and \ wordNextNext in ["hodin", "minut", "sekund"]: used += 3 if wordNextNext == "hodin": hrOffset = 2 elif wordNextNext == "minut": minOffset = 2 elif wordNextNext == "sekund": secOffset = 2 # parse half an hour, quarter hour elif word == "hodin" and \ (wordPrev in markers or wordPrevPrev in markers): if wordPrev == "půl": minOffset = 30 elif wordPrev == "čtvrt": minOffset = 15 elif wordPrevPrev == "třičtvrtě": minOffset = 15 if idx > 2 and words[idx - 3] in markers: words[idx - 3] = "" words[idx - 2] = "" elif wordPrev == "během": hrOffset = 1 else: hrOffset = 1 if wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "tato" or wordPrevPrev == "této": daySpecified = True words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc # parse in a minute elif word == "minut" and wordPrev == "za": minOffset = 1 words[idx - 1] = "" used += 1 # parse in a second elif word == "sekund" and wordPrev == "za": secOffset = 1 words[idx - 1] = "" used += 1 elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if wordNext == "večer" or wordNext == "noci" or wordNextNext == "večer" \ 
or wordNextNext == "noci" or wordPrev == "večer" \ or wordPrev == "noci" or wordPrevPrev == "večer" \ or wordPrevPrev == "noci" or wordNextNextNext == "večer" \ or wordNextNextNext == "noci": remainder = "pm" used += 1 if wordPrev == "večer" or wordPrev == "noci": words[idx - 1] = "" if wordPrevPrev == "večer" or wordPrevPrev == "noci": words[idx - 2] = "" if wordNextNext == "večer" or wordNextNext == "noci": used += 1 if wordNextNextNext == "večer" or wordNextNextNext == "noci": used += 1 if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 # elif wordNext == "in" and wordNextNext == "the" and \ # words[idx + 3] == "ráno": # remainder = "am" # used += 3 # elif wordNext == "in" and wordNextNext == "the" and \ # words[idx + 3] == "odpoledne": # remainder = "pm" # used += 3 # elif wordNext == "in" and wordNextNext == "the" and \ # words[idx + 3] == "večer": # remainder = "pm" # used += 3 elif wordNext == "ráno": remainder = "am" used += 2 elif wordNext == "odpoledne": remainder = "pm" used += 2 elif wordNext == "večer": remainder = "pm" used += 2 elif wordNext == "toto" and wordNextNext == "ráno": remainder = "am" used = 2 daySpecified = True elif wordNext == "na" and wordNextNext == "odpoledne": remainder = "pm" used = 2 daySpecified = True elif wordNext == "na" and wordNextNext == "večer": remainder = "pm" used = 2 daySpecified = True elif wordNext == "v" and wordNextNext == "noci": if strHH and int(strHH) > 5: remainder = "pm" else: remainder = "am" used += 2 else: if timeQualifier != "": military = True if strHH and int(strHH) <= 12 and \ 
(timeQualifier in timeQualifiersPM): strHH += str(int(strHH) + 12) else: # try to parse numbers without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 elif ( remainder in recur_markers or wordNext in recur_markers or wordNextNext in recur_markers): # Ex: "7 on mondays" or "3 this friday" # Set strHH so that isTime == True # when am or pm is not specified strHH = strNum used = 1 else: if (int(strNum) > 100): # and #Check this # ( # wordPrev == "o" or # wordPrev == "oh" # )): # 0800 hours (pronounced oh-eight-hundred) strHH = str(int(strNum) // 100) strMM = str(int(strNum) % 100) military = True if wordNext == "hodin": used += 1 elif ( (wordNext == "hodin" or remainder == "hodin") and word[0] != '0' and # (wordPrev != "v" and wordPrev != "na") wordPrev == "za" and ( int(strNum) < 100 or int(strNum) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minut" or \ remainder == "minut": # "in 10 minutes" minOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "sekund" \ or remainder == "sekund": # in 5 seconds secOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(strNum) > 100: # military time, eg. "3300 hours" strHH = str(int(strNum) // 100) strMM = str(int(strNum) % 100) military = True if wordNext == "hodin" or \ remainder == "hodin": used += 1 elif wordNext and wordNext[0].isdigit(): # military time, e.g. 
"04 38 hours" strHH = strNum strMM = wordNext military = True used += 1 if (wordNextNext == "hodin" or remainder == "hodin"): used += 1 elif ( wordNext == "" or wordNext == "hodin" or ( (wordNext == "v" or wordNext == "na") and ( wordNextNext == timeQualifier ) ) or wordNext == 'večer' or wordNextNext == 'večer'): strHH = strNum strMM = "00" if wordNext == "hodin": used += 1 if (wordNext == "v" or wordNext == "na" or wordNextNext == "v" or wordNextNext == "na"): used += (1 if (wordNext == "v" or wordNext == "na") else 2) wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if (wordNextNext and (wordNextNext in timeQualifier or wordNextNextNext in timeQualifier)): if (wordNextNext in timeQualifiersPM or wordNextNextNext in timeQualifiersPM): remainder = "pm" used += 1 if (wordNextNext in timeQualifiersAM or wordNextNextNext in timeQualifiersAM): remainder = "am" used += 1 if timeQualifier != "": if timeQualifier in timeQualifiersPM: remainder = "pm" used += 1 elif timeQualifier in timeQualifiersAM: remainder = "am" used += 1 else: # TODO: Unsure if this is 100% accurate used += 1 military = True elif remainder == "hodin": remainder = "" else: isTime = False HH = int(strHH) if strHH else 0 MM = int(strMM) if strMM else 0 HH = HH + 12 if remainder == "pm" and HH < 12 else HH HH = HH - 12 if remainder == "am" and HH >= 12 else HH if (not military and remainder not in ['am', 'pm', 'hodin', 'minut', 'sekund'] and ((not daySpecified) or 0 <= dayOffset < 1)): # ambiguous time, detect whether they mean this evening or # the next morning based on whether it has already passed if anchorDate.hour < HH or (anchorDate.hour == HH and anchorDate.minute < MM): pass # No modification needed elif anchorDate.hour < HH + 12: HH += 12 else: # has passed, assume the next morning dayOffset += 1 if timeQualifier in timeQualifiersPM and HH < 12: HH += 12 if HH > 24 or MM > 59: isTime = False used = 0 if isTime: hrAbs = HH minAbs = MM used += 1 if used > 0: # removed parsed 
words from the sentence for i in range(used): if idx + i >= len(words): break words[idx + i] = "" # if wordPrev == "o" or wordPrev == "oh": # words[words.index(wordPrev)] = "" if wordPrev == "brzy": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "pozdě": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if wordPrev == "toto" or wordPrev == "této": daySpecified = True if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "toto" or wordPrev == "této": daySpecified = True idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = anchorDate.replace(microsecond=0) if datestr != "": # date included an explicit date, e.g. "june 5" or "june 2, 2017" try: temp = datetime.strptime(datestr, "%B %d") except ValueError: # Try again, allowing the year temp = datetime.strptime(datestr, "%B %d %Y") extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if not hasYear: temp = temp.replace(year=extractedDate.year, tzinfo=extractedDate.tzinfo) if extractedDate < temp: extractedDate = extractedDate.replace( year=int(currentYear), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: # ignore the current HH:MM:SS if relative using days or greater if hrOffset == 0 and minOffset == 0 and secOffset == 0: extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + 
def isFractional_cs(input_str, short_scale=True):
    """
    Check whether the given Czech word names a fraction.

    Plural fraction names ("třetiny") are reduced to their singular form
    ("třetina") before the lookup. Matching is case-insensitive.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): not used by this function; kept in the
            signature so existing callers keep working
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # Lowercase up front so the plural-suffix check is case-insensitive as
    # well; previously only the final lookup was lowercased.
    word = input_str.lower()
    if word.endswith("iny"):
        # Numerator bigger than one (jedna třetina, dvě třetiny):
        # normalize to the singular form (třetiny > třetina)
        word = word[:-1] + "a"

    # "celá" maps to 1; all other names come from the shared table, where
    # only denominators greater than one are considered.
    fracts = {"celá": 1}
    fracts.update((name, denominator)
                  for denominator, name in _FRACTION_STRING_CS.items()
                  if denominator > 1)

    if word in fracts:
        return 1.0 / fracts[word]
    return False


def extract_numbers_cs(text, short_scale=True, ordinals=False):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    results = _extract_numbers_with_text_cs(tokenize(text),
                                            short_scale, ordinals)
    return [float(result.value) for result in results]


class CzechNormalizer(Normalizer):
    """Normalizer configured from the bundled cs-cz normalize.json file."""
    # The configuration is read once, while the class body is executed.
    with open(resolve_resource_file("text/cs-cz/normalize.json"),
              encoding='utf8') as f:
        _default_config = json.load(f)


def normalize_cs(text, remove_articles=True):
    """ Czech string normalization """
    return CzechNormalizer().normalize(text, remove_articles)


# Inflected forms of the number words "one" and "two" (used with arg == 1).
_CS_NUMBER_INFLECTIONS = {
    "jeden": "jedna",
    "jedno": "jedna",
    "jedny": "jedna",
    "dvě": "dva",
}

# Inflected date/time vocabulary mapped to the base form that
# extract_datetime_cs compares against (used with arg == 2).
_CS_DATETIME_INFLECTIONS = {
    "hodina": "hodin",
    "hodiny": "hodin",
    "hodinu": "hodin",
    "minuta": "minut",
    "minuty": "minut",
    "minutu": "minut",
    "sekunda": "sekund",
    "sekundy": "sekund",
    "sekundu": "sekund",
    "dní": "den",
    "dnů": "den",
    "dny": "den",
    "týdny": "týden",
    "týdnů": "týden",
    "měsíců": "měsíc",
    "měsíce": "měsíc",
    "měsíci": "měsíc",
    "roky": "rok",
    "roků": "rok",
    "let": "rok",
    "včerejšku": "včera",
    "zítřku": "zítra",
    "zítřejší": "zítra",
    "ranní": "ráno",
    "dopolední": "dopoledne",
    "polední": "poledne",
    "odpolední": "odpoledne",
    "večerní": "večer",
    "noční": "noc",
    "víkendech": "víkend",
    "víkendu": "víkend",
    "všedních": "všední",
    "všedním": "všední",
    # Months whose inflected forms do not follow the "-na"/"-nu" pattern
    # handled generically in _text_cs_inflection_normalize.
    "února": "únor",        # genitive, was previously not normalized
    "únoru": "únor",
    "červenci": "červenec",
    "července": "červenec",
    "listopadu": "listopad",
    "prosince": "prosinec",  # genitive, was previously not normalized
    "prosinci": "prosinec",
}


def _text_cs_inflection_normalize(word, arg):
    """
    Czech Inflection normalizer.

    Maps a known inflected form to its base form. The function is called
    from multiple places; arg selects which vocabulary applies.

    Args:
        word [Word]: the (lowercase) word to normalize
        arg [Int]: 1 for number words (_extract_whole_number_with_text_cs),
                   2 for date/time words (extract_datetime_cs);
                   any other value leaves the word untouched

    Returns:
        word [Word]: the base form, or the input word if it is not known
    """
    if arg == 1:
        return _CS_NUMBER_INFLECTIONS.get(word, word)

    if arg == 2:
        if word in _CS_DATETIME_INFLECTIONS:
            return _CS_DATETIME_INFLECTIONS[word]
        # Regular months: ledna/lednu > leden, března/březnu > březen, ...
        if word.endswith(("nu", "na")):
            candidate = word[:-2] + "en"
            if candidate in _MONTHS_CZECH:
                return candidate

    return word
def extract_number_da(text, short_scale=True, ordinals=False):
    """
    Extract the first number found in a Danish text.

    Recognises numeric tokens (including decimals such as "2.5"), words
    listed in _DA_NUMBERS, tokens accepted by is_fractional_da or
    is_ordinal_da, "a/b" style fractions, and two values joined by "og",
    which are added together. The words "den" and "det" are ignored.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): unused, kept for API compatibility
        ordinals (bool): unused, kept for API compatibility
    Returns:
        (int) or (float) or False: the extracted value; False when nothing
        was found or the value is zero
    """
    # TODO: short_scale and ordinals don't do anything here.
    # The parameters are present in the function signature for API
    # compatibility reasons.

    words = [w for w in text.lower().split() if w not in ("den", "det")]
    and_pass = False        # True once the left operand of "og" was read
    val_pre_and = False     # value found before "og"
    val = False
    count = 0
    while count < len(words):
        word = words[count]
        if is_numeric(word):
            # str.isdigit() alone rejected decimals like "2.5". Requiring
            # at least one digit keeps float spellings such as "nan" or
            # "inf" from being read as numbers.
            if any(ch.isdigit() for ch in word):
                try:
                    val = float(word)
                except ValueError:
                    # is_numeric is defined elsewhere; do not rely on it
                    # matching float() exactly.
                    pass
        elif is_fractional_da(word):
            val = is_fractional_da(word)
        elif is_ordinal_da(word):
            val = is_ordinal_da(word)
        elif word in _DA_NUMBERS:
            val = _DA_NUMBERS[word]
            # A number word may be followed by a fraction word; the two
            # are multiplied and the fraction word is consumed.
            next_word = words[count + 1] if count + 1 < len(words) else ""
            next_val = is_fractional_da(next_word)
            if next_val:
                val = val * next_val
                words[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            pieces = word.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
            elif and_pass:
                # nothing to add after "og": keep the first value and stop
                val = val_pre_and
                break
            else:
                count += 1
                continue

        words[count] = ""

        if and_pass:
            words[count - 1] = ''  # remove "og"
            val += val_pre_and
        elif count + 1 < len(words) and words[count + 1] == 'og':
            and_pass = True
            val_pre_and = val
            val = False
            count += 2
            continue
        elif count + 2 < len(words) and words[count + 2] == 'og':
            and_pass = True
            val_pre_and = val
            val = False
            count += 3
            continue

        break

    return val or False
'am' is a preposition, so cannot currently be used for 12 hour date format """ s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace(' den ', ' ').replace(' det ', ' ').replace(' om ', ' ').replace( ' om ', ' ') \ .replace(' på ', ' ').replace(' om ', ' ') wordList = s.split() for idx, word in enumerate(wordList): if is_ordinal_da(word) is not False: word = str(is_ordinal_da(word)) wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or timeStr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersList = ['tidlig', 'morgen', 'morgenen', 'formidag', 'formiddagen', 'eftermiddag', 'eftermiddagen', 'aften', 'aftenen', 'nat', 'natten'] markers = ['i', 'om', 'på', 'klokken', 'ved'] days = ['mandag', 'tirsdag', 'onsdag', 'torsdag', 'fredag', 'lørdag', 'søndag'] months = ['januar', 'februar', 'marts', 'april', 'maj', 'juni', 'juli', 'august', 'september', 'oktober', 'november', 'desember'] monthsShort = ['jan', 'feb', 'mar', 'apr', 'maj', 'juni', 'juli', 'aug', 'sep', 'okt', 'nov', 'des'] validFollowups = days + months + monthsShort validFollowups.append("i dag") validFollowups.append("morgen") validFollowups.append("næste") validFollowups.append("forige") validFollowups.append("nu") words = clean_string(text) for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" start = idx used = 0 # save timequalifier 
for later if word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "dag" and not fromFlag: dayOffset = 0 used += 1 elif word == "morgen" and not fromFlag and wordPrev != "om" and \ wordPrev not in days: # morgen means tomorrow if not "am # Morgen" and not [day of the week] morgen dayOffset = 1 used += 1 elif word == "overmorgen" and not fromFlag: dayOffset = 2 used += 1 # parse 5 days, 10 weeks, last week, next week elif word == "dag" or word == "dage": if wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 elif word == "uge" or word == "uger" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev[:6] == "næste": dayOffset = 7 start -= 1 used = 2 elif wordPrev[:5] == "forige": dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "måned" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev[:6] == "næste": monthOffset = 1 start -= 1 used = 2 elif wordPrev[:5] == "forige": monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "år" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev[:6] == " næste": yearOffset = 1 start -= 1 used = 2 elif wordPrev[:6] == "næste": yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordNext == "morgen": # morgen means morning if preceded by # the day of the week words[idx + 1] = "tidlig" if wordPrev[:6] == "næste": dayOffset += 7 used += 1 start -= 1 elif wordPrev[:5] == "forige": dayOffset -= 7 used += 1 start -= 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort and not fromFlag: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and (wordPrev[0].isdigit() or (wordPrev == "of" and wordPrevPrev[0].isdigit())): if wordPrev == "of" and wordPrevPrev[0].isdigit(): datestr += " " + words[idx - 2] used += 1 start -= 1 else: datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July if ( word == "fra" or word == "til" or word == "om") and wordNext \ in validFollowups: used = 2 fromFlag = True if wordNext == "morgenen" and \ wordPrev != "om" and \ wordPrev not in days: # morgen means tomorrow if not "am Morgen" and not # [day of the week] morgen: dayOffset += 1 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext[:6] == "næste": tmpOffset += 7 used += 1 start -= 1 elif wordNext[:5] == "forige": tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 
1].startswith("denne"): start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time timeStr = "" hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word[:6] == "middag": hrAbs = 12 used += 1 elif word[:11] == "midnat": hrAbs = 0 used += 1 elif word == "morgenen" or ( wordPrev == "om" and word == "morgenen") or word == "tidlig": if not hrAbs: hrAbs = 8 used += 1 elif word[:11] == "eftermiddag": if not hrAbs: hrAbs = 15 used += 1 elif word[:5] == "aften": if not hrAbs: hrAbs = 19 used += 1 # parse half an hour, quarter hour elif word == "time" and \ (wordPrev in markers or wordPrevPrev in markers): if wordPrev[:4] == "halv": minOffset = 30 elif wordPrev == "kvarter": minOffset = 15 elif wordPrev == "trekvarter": minOffset = 45 else: hrOffset = 1 if wordPrevPrev in markers: words[idx - 2] = "" words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if 
nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif nextWord == "aften": remainder = "pm" used += 1 elif wordNext == "om" and wordNextNext == "morgenen": remainder = "am" used += 2 elif wordNext == "om" and wordNextNext == "eftermiddagen": remainder = "pm" used += 2 elif wordNext == "om" and wordNextNext == "aftenen": remainder = "pm" used += 2 elif wordNext == "morgen": remainder = "am" used += 1 elif wordNext == "eftermiddag": remainder = "pm" used += 1 elif wordNext == "aften": remainder = "pm" used += 1 elif wordNext == "i" and wordNextNext == "morgen": remainder = "am" used = 2 elif wordNext == "i" and wordNextNext == "eftermiddag": remainder = "pm" used = 2 elif wordNext == "i" and wordNextNext == "aften": remainder = "pm" used = 2 elif wordNext == "natten": if strHH > 4: remainder = "pm" else: remainder = "am" used += 1 else: if timeQualifier != "": if strHH <= 12 and \ (timeQualifier == "aftenen" or timeQualifier == "eftermiddagen"): strHH += 12 # what happens when strHH is 24? else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." 
or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if wordNext == "time" and int(word) < 100: # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minut": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "sekund": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "time": strHH = word used += 1 isTime = True if wordNextNext == timeQualifier: strMM = "" if wordNextNext[:11] == "eftermiddag": used += 1 remainder = "pm" elif wordNextNext == "om" and wordNextNextNext == \ "eftermiddagen": used += 2 remainder = "pm" elif wordNextNext[:5] == "aften": used += 1 remainder = "pm" elif wordNextNext == "om" and wordNextNextNext == \ "aftenen": used += 2 remainder = "pm" elif wordNextNext[:6] == "morgen": used += 1 remainder = "am" elif wordNextNext == "om" and wordNextNextNext == \ "morgenen": used += 2 remainder = "am" elif wordNextNext == "natten": used += 1 if 8 <= int(word) <= 12: remainder = "pm" else: remainder = "am" elif is_numeric(wordNextNext): strMM = wordNextNext used += 1 if wordNextNextNext == timeQualifier: if wordNextNextNext[:11] == "eftermiddag": used += 1 remainder = "pm" elif wordNextNextNext == "om" and \ wordNextNextNextNext == \ "eftermiddagen": used += 2 remainder = "pm" elif wordNextNextNext[:6] == "natten": used += 1 remainder = "pm" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "natten": used += 2 remainder = "pm" elif wordNextNextNext[:7] == "morgenen": used += 1 remainder = "am" elif wordNextNextNext == "om" and \ wordNextNextNextNext == "morgenen": used += 2 remainder = "am" elif wordNextNextNext == "natten": used += 1 if 8 <= int(word) <= 12: remainder = "pm" else: remainder = "am" elif wordNext == timeQualifier: strHH = word strMM = 00 isTime = True if wordNext[:10] == "eftermidag": used += 1 remainder = "pm" elif wordNext == "om" and \ wordNextNext == 
"eftermiddanen": used += 2 remainder = "pm" elif wordNext[:7] == "aftenen": used += 1 remainder = "pm" elif wordNext == "om" and wordNextNext == "aftenen": used += 2 remainder = "pm" elif wordNext[:7] == "morgenen": used += 1 remainder = "am" elif wordNext == "ao" and wordNextNext == "morgenen": used += 2 remainder = "am" elif wordNext == "natten": used += 1 if 8 <= int(word) <= 12: remainder = "pm" else: remainder = "am" # if timeQualifier != "": # military = True # else: # isTime = False strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "tidlig": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "sen": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": en_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] for idx, en_month in enumerate(en_months): datestr = datestr.replace(months[idx], en_month) for idx, en_month in enumerate(en_monthsShort): datestr = datestr.replace(monthsShort[idx], en_month) temp = datetime.strptime(datestr, "%B %d") if extractedDate.tzinfo: temp = temp.replace(tzinfo=extractedDate.tzinfo) if not 
hasYear: temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if timeStr != "": temp = datetime(timeStr) extractedDate = extractedDate.replace(hour=temp.strftime("%H"), minute=temp.strftime("%M"), second=temp.strftime("%S")) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs is None and minAbs is None and default_time: hrAbs = default_time.hour minAbs = default_time.minute if hrAbs != -1 and minAbs != -1: extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs or 0) if (hrAbs or minAbs) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + relativedelta(hours=hrOffset) if minOffset != 0: extractedDate = extractedDate + relativedelta(minutes=minOffset) if secOffset != 0: extractedDate = extractedDate + relativedelta(seconds=secOffset) for idx, word in enumerate(words): if words[idx] == "og" and words[idx - 1] == "" \ and words[idx + 1] == "": words[idx] = "" resultStr = " ".join(words) resultStr = ' '.join(resultStr.split()) return [extractedDate, resultStr] def is_fractional_da(input_str, short_scale=True): """ This function takes the given text and checks if it is a fraction. 
def is_ordinal_da(input_str):
    """
    Check whether the given word is an ordinal number and return its value.

    The word is lower-cased, then matched against a fixed list of ordinal
    prefixes. If none matches, the suffixes "nde", "ende" and "te" are
    stripped in that order and the remaining stem is looked up in
    _DA_NUMBERS after each strip.

    Args:
        input_str (str): the string to check if ordinal
    Returns:
        (bool) or (int): False if not an ordinal, otherwise the number
        corresponding to the ordinal

    Only works for the listed prefixes and for ordinals whose stem is a key
    of _DA_NUMBERS.
    """
    lowerstr = input_str.lower()

    # Ordinals recognised by prefix, so inflected forms match as well.
    # "elfte" used to return 1; it sits between 6 and 12 and means 11.
    prefixed_ordinals = (
        ("første", 1),
        ("anden", 2),
        ("tredie", 3),
        ("fjerde", 4),
        ("femte", 5),
        ("sjette", 6),
        ("elfte", 11),
        ("tolvfte", 12),
    )
    for prefix, value in prefixed_ordinals:
        if lowerstr.startswith(prefix):
            return value

    # Regular ordinals: strip a suffix and look the stem up. The strips are
    # cumulative on purpose -- a stem that is not a known number is kept
    # stripped when the next suffix is tried, exactly as before.
    if lowerstr[-3:] == "nde":
        lowerstr = lowerstr[:-3]
        if lowerstr in _DA_NUMBERS:
            return _DA_NUMBERS[lowerstr]

    if lowerstr[-4:] == "ende":
        lowerstr = lowerstr[:-4]
        if lowerstr in _DA_NUMBERS:
            return _DA_NUMBERS[lowerstr]

    if lowerstr[-2:] == "te":
        lowerstr = lowerstr[:-2]
        if lowerstr in _DA_NUMBERS:
            return _DA_NUMBERS[lowerstr]

    return False
def extract_numbers_da(text, short_scale=True, ordinals=False):
    """
    Extract every number found in a string.

    Delegates to extract_numbers_generic, passing the Danish pronounce and
    extract functions.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million. The default is short scale.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    options = {"short_scale": short_scale, "ordinals": ordinals}
    return extract_numbers_generic(text, pronounce_number_da,
                                   extract_number_da, **options)
def extract_duration_de(text):
    """
    Convert a German phrase into a duration.

    Converts things like:
        "10 Minuten"
        "3 Tage 8 Stunden 10 Minuten und 49 Sekunden"
    into a timedelta. The words used in the duration are consumed and the
    remainder is returned, lower-cased. As an example,
    "Stelle einen Timer für 5 Minuten" returns
    (timedelta(minutes=5), "stelle einen timer für").

    Only amounts written with digits are recognised; number words are not
    converted.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text not
            consumed in the parsing. The first value is None if no
            duration is found. The returned text has whitespace stripped
            from the ends.
        None: if text is empty or None (kept as-is for existing callers).
    """
    if not text:
        return None

    text = text.lower()

    # timedelta() keyword -> German plural unit name.
    time_units = {
        'microseconds': 'mikrosekunden',
        'milliseconds': 'millisekunden',
        'seconds': 'sekunden',
        'minutes': 'minuten',
        'hours': 'stunden',
        'days': 'tage',
        'weeks': 'wochen'
    }

    # Singular and plural: the unit is cut to its stem (last letter removed)
    # and an optional trailing "n" or "e" is accepted again.
    # The group must be named "(?P<value>...)"; a bare "(?P" is a regex
    # syntax error.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ne]?"

    # TODO entry point for text-to-number conversion
    # text = _convert_words_to_numbers_de(text)

    totals = {}
    for unit_en, unit_de in time_units.items():
        unit_re = re.compile(pattern.format(unit=unit_de[:-1]))
        # Sum every amount given for this unit, then drop the matched words.
        totals[unit_en] = sum(float(match.group("value"))
                              for match in unit_re.finditer(text))
        text = unit_re.sub('', text)

    text = text.strip()
    duration = timedelta(**totals) if any(totals.values()) else None

    return (duration, text)
text = text.lower() aWords = text.split() aWords = [word for word in aWords if word not in ["der", "die", "das", "des", "den", "dem"]] and_pass = False valPreAnd = False val = False count = 0 while count < len(aWords): word = aWords[count] if is_numeric(word): # if word.isdigit(): # doesn't work with decimals val = float(word) elif is_fractional_de(word): val = is_fractional_de(word) elif is_ordinal_de(word): val = is_ordinal_de(word) else: if word in _DE_NUMBERS: val = _DE_NUMBERS[word] if count < (len(aWords) - 1): wordNext = aWords[count + 1] else: wordNext = "" valNext = is_fractional_de(wordNext) if valNext: val = val * valNext aWords[count + 1] = "" if not val: # look for fractions like "2/3" aPieces = word.split('/') # if (len(aPieces) == 2 and is_numeric(aPieces[0]) # and is_numeric(aPieces[1])): if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) elif and_pass: # added to value, quit here val = valPreAnd break else: count += 1 continue aWords[count] = "" if and_pass: aWords[count - 1] = '' # remove "and" val += valPreAnd elif count + 1 < len(aWords) and aWords[count + 1] == 'und': and_pass = True valPreAnd = val val = False count += 2 continue elif count + 2 < len(aWords) and aWords[count + 2] == 'und': and_pass = True valPreAnd = val val = False count += 3 continue break return val or False def extract_datetime_de(text, anchorDate=None, default_time=None): def clean_string(s): """ cleans the input string of unneeded punctuation and capitalization among other things. 
'am' is a preposition, so cannot currently be used for 12 hour date format """ s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace(' der ', ' ').replace(' den ', ' ').replace(' an ', ' ').replace( ' am ', ' ') \ .replace(' auf ', ' ').replace(' um ', ' ') wordList = s.split() for idx, word in enumerate(wordList): if is_ordinal_de(word) is not False: word = str(is_ordinal_de(word)) wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or timeStr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersList = ['früh', 'morgens', 'vormittag', 'vormittags', 'nachmittag', 'nachmittags', 'abend', 'abends', 'nachts'] markers = ['in', 'am', 'gegen', 'bis', 'für'] days = ['montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag', 'sonntag'] months = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', 'juli', 'august', 'september', 'october', 'november', 'dezember'] monthsShort = ['jan', 'feb', 'mär', 'apr', 'mai', 'juni', 'juli', 'aug', 'sept', 'oct', 'nov', 'dez'] validFollowups = days + months + monthsShort validFollowups.append("heute") validFollowups.append("morgen") validFollowups.append("nächste") validFollowups.append("nächster") validFollowups.append("nächstes") validFollowups.append("nächsten") validFollowups.append("nächstem") validFollowups.append("letzte") validFollowups.append("letzter") validFollowups.append("letztes") validFollowups.append("letzten") validFollowups.append("letztem") validFollowups.append("jetzt") words = clean_string(text) for idx, word in enumerate(words): if word == "": 
continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # this isn't in clean string because I don't want to save back to words if word != 'morgen' and word != 'übermorgen': if word[-2:] == "en": word = word[:-2] # remove en if word != 'heute': if word[-1:] == "e": word = word[:-1] # remove plural for most nouns start = idx used = 0 # save timequalifier for later if word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "heute" and not fromFlag: dayOffset = 0 used += 1 elif word == "morgen" and not fromFlag and wordPrev != "am" and \ wordPrev not in days: # morgen means tomorrow if not "am # Morgen" and not [day of the week] morgen dayOffset = 1 used += 1 elif word == "übermorgen" and not fromFlag: dayOffset = 2 used += 1 # parse 5 days, 10 weeks, last week, next week elif word == "tag" or word == "tage": if wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 elif word == "woch" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev[:6] == "nächst": dayOffset = 7 start -= 1 used = 2 elif wordPrev[:5] == "letzt": dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "monat" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev[:6] == "nächst": monthOffset = 1 start -= 1 used = 2 elif wordPrev[:5] == "letzt": monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "jahr" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev[:6] == "nächst": yearOffset = 1 start -= 1 used = 2 elif wordPrev[:6] == "nächst": yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordNext == "morgen": # morgen means morning if preceded by # the day of the week words[idx + 1] = "früh" if wordPrev[:6] == "nächst": dayOffset += 7 used += 1 start -= 1 elif wordPrev[:5] == "letzt": dayOffset -= 7 used += 1 start -= 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort and not fromFlag: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and (wordPrev[0].isdigit() or (wordPrev == "of" and wordPrevPrev[0].isdigit())): if wordPrev == "of" and wordPrevPrev[0].isdigit(): datestr += " " + words[idx - 2] used += 1 start -= 1 else: datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July if ( word == "von" or word == "nach" or word == "ab") and wordNext \ in validFollowups: used = 2 fromFlag = True if wordNext == "morgen" and wordPrev != "am" and \ wordPrev not in days: # morgen means tomorrow if not "am # Morgen" and not [day of the week] morgen: dayOffset += 1 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext[:6] == "nächst": tmpOffset += 7 used += 1 start -= 1 elif wordNext[:5] == "letzt": tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 
1].startswith("diese"): start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time timeStr = "" hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word[:6] == "mittag": hrAbs = 12 used += 1 elif word[:11] == "mitternacht": hrAbs = 0 used += 1 elif word == "morgens" or ( wordPrev == "am" and word == "morgen") or word == "früh": if not hrAbs: hrAbs = 8 used += 1 elif word[:10] == "nachmittag": if not hrAbs: hrAbs = 15 used += 1 elif word[:5] == "abend": if not hrAbs: hrAbs = 19 used += 1 # parse half an hour, quarter hour elif word == "stunde" and \ (wordPrev in markers or wordPrevPrev in markers): if wordPrev[:4] == "halb": minOffset = 30 elif wordPrev == "viertel": minOffset = 15 elif wordPrev == "dreiviertel": minOffset = 45 else: hrOffset = 1 if wordPrevPrev in markers: words[idx - 2] = "" words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if 
nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif nextWord == "abends": remainder = "pm" used += 1 elif wordNext == "am" and wordNextNext == "morgen": remainder = "am" used += 2 elif wordNext == "am" and wordNextNext == "nachmittag": remainder = "pm" used += 2 elif wordNext == "am" and wordNextNext == "abend": remainder = "pm" used += 2 elif wordNext == "morgens": remainder = "am" used += 1 elif wordNext == "nachmittags": remainder = "pm" used += 1 elif wordNext == "abends": remainder = "pm" used += 1 elif wordNext == "heute" and wordNextNext == "morgen": remainder = "am" used = 2 elif wordNext == "heute" and wordNextNext == "nachmittag": remainder = "pm" used = 2 elif wordNext == "heute" and wordNextNext == "abend": remainder = "pm" used = 2 elif wordNext == "nachts": if strHH > 4: remainder = "pm" else: remainder = "am" used += 1 else: if timeQualifier != "": if strHH <= 12 and \ (timeQualifier == "abends" or timeQualifier == "nachmittags"): strHH += 12 # what happens when strHH is 24? else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." 
or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if wordNext == "stund" and int(word) < 100: # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minut": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "sekund": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "uhr": strHH = word used += 1 isTime = True if wordNextNext == timeQualifier: strMM = "" if wordNextNext[:10] == "nachmittag": used += 1 remainder = "pm" elif wordNextNext == "am" and wordNextNextNext == \ "nachmittag": used += 2 remainder = "pm" elif wordNextNext[:5] == "abend": used += 1 remainder = "pm" elif wordNextNext == "am" and wordNextNextNext == \ "abend": used += 2 remainder = "pm" elif wordNextNext[:7] == "morgens": used += 1 remainder = "am" elif wordNextNext == "am" and wordNextNextNext == \ "morgen": used += 2 remainder = "am" elif wordNextNext == "nachts": used += 1 if 8 <= int(word) <= 12: remainder = "pm" else: remainder = "am" elif is_numeric(wordNextNext): strMM = wordNextNext used += 1 if wordNextNextNext == timeQualifier: if wordNextNextNext[:10] == "nachmittag": used += 1 remainder = "pm" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "nachmittag": used += 2 remainder = "pm" elif wordNextNextNext[:5] == "abend": used += 1 remainder = "pm" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "abend": used += 2 remainder = "pm" elif wordNextNextNext[:7] == "morgens": used += 1 remainder = "am" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "morgen": used += 2 remainder = "am" elif wordNextNextNext == "nachts": used += 1 if 8 <= int(word) <= 12: remainder = "pm" else: remainder = "am" elif wordNext == timeQualifier: strHH = word strMM = 00 isTime = True if wordNext[:10] == "nachmittag": used += 1 remainder = "pm" elif wordNext == "am" and wordNextNext == "nachmittag": used += 2 
remainder = "pm" elif wordNext[:5] == "abend": used += 1 remainder = "pm" elif wordNext == "am" and wordNextNext == "abend": used += 2 remainder = "pm" elif wordNext[:7] == "morgens": used += 1 remainder = "am" elif wordNext == "am" and wordNextNext == "morgen": used += 2 remainder = "am" elif wordNext == "nachts": used += 1 if 8 <= int(word) <= 12: remainder = "pm" else: remainder = "am" # if timeQualifier != "": # military = True # else: # isTime = False strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "Uhr": words[words.index(wordPrev)] = "" if wordPrev == "früh": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "spät": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": en_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] for idx, en_month in enumerate(en_months): datestr = datestr.replace(months[idx], en_month) for idx, en_month in enumerate(en_monthsShort): datestr = datestr.replace(monthsShort[idx], en_month) temp = datetime.strptime(datestr, "%B %d") if extractedDate.tzinfo: temp = 
temp.replace(tzinfo=extractedDate.tzinfo) if not hasYear: temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if timeStr != "": temp = datetime(timeStr) extractedDate = extractedDate.replace(hour=temp.strftime("%H"), minute=temp.strftime("%M"), second=temp.strftime("%S")) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs is None and minAbs is None and default_time: hrAbs = default_time.hour minAbs = default_time.minute if hrAbs != -1 and minAbs != -1: extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs or 0) if (hrAbs or minAbs) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + relativedelta(hours=hrOffset) if minOffset != 0: extractedDate = extractedDate + relativedelta(minutes=minOffset) if secOffset != 0: extractedDate = extractedDate + relativedelta(seconds=secOffset) for idx, word in enumerate(words): if words[idx] == "und" and words[idx - 1] == "" \ and words[idx + 1] == "": words[idx] = "" resultStr = " ".join(words) resultStr = ' '.join(resultStr.split()) return [extractedDate, resultStr] def is_fractional_de(input_str, short_scale=True): """ This function takes the given text and checks if it is a fraction. 
def is_ordinal_de(input_str):
    """
    Check whether the given word is an ordinal number and return its value.

    The word is lower-cased and compared against the irregular ordinal
    prefixes for 1, 3, 7 and 8. Otherwise the ordinal endings are stripped
    group by group and the remaining stem is looked up in _DE_NUMBERS.

    Args:
        input_str (str): the string to check if ordinal
    Returns:
        (bool) or (int): False if not an ordinal, otherwise the number
        corresponding to the ordinal

    Only works for ordinals corresponding to the numbers in _DE_NUMBERS.
    """
    candidate = input_str.lower()

    # Irregular ordinals, matched by prefix so declined forms are covered.
    irregular = (("erste", 1), ("dritte", 3), ("siebte", 7), ("achte", 8))
    for prefix, number in irregular:
        if candidate.startswith(prefix):
            return number

    # Ending groups in the order they are tried. All endings of a group have
    # the same length. A strip is kept even when the stem is unknown, so the
    # next group sees the shortened word.
    ending_groups = (
        ("ste",),                          # from 20 suffix is -ste*
        ("ster", "stes", "sten", "stem"),
        ("te",),                           # below 20 suffix is -te*
        ("ter", "tes", "ten", "tem"),
    )
    for endings in ending_groups:
        width = len(endings[0])
        if candidate[-width:] in endings:
            candidate = candidate[:-width]
            if candidate in _DE_NUMBERS:
                return _DE_NUMBERS[candidate]

    return False
def extract_numbers_de(text, short_scale=True, ordinals=False):
    """
    Extract every number found in a string.

    Delegates to extract_numbers_generic, passing the German pronounce and
    extract functions.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million. The default is short scale.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    options = {"short_scale": short_scale, "ordinals": ordinals}
    return extract_numbers_generic(text, pronounce_number_de,
                                   extract_number_de, **options)
def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
    """
    Replace the number words of a string with their numeric values.

    Args:
        text str: the text to convert
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third) should
                          be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The original text, with numbers subbed in where appropriate.

    """
    tokens = tokenize(text)
    pending = sorted(
        _extract_numbers_with_text_en(tokens, short_scale, ordinals),
        key=lambda number: number.start_index)

    pieces = []
    cursor = 0  # position of the next number that still has to be emitted
    for token in tokens:
        # plain word: nothing left to replace, or we are still in front of
        # the next number's span
        if cursor >= len(pending) or \
                token.index < pending[cursor].start_index:
            pieces.append(token.word)
            continue

        current = pending[cursor]
        # the first token of a span is replaced by the value, the remaining
        # tokens of the span are dropped
        if token.index == current.start_index:
            pieces.append(str(current.value))
        if token.index == current.end_index:
            cursor += 1

    return ' '.join(pieces)
def _extract_number_with_text_en(tokens, short_scale=True,
                                 ordinals=False, fractional_numbers=True):
    """
    Extract a single number from a list of Tokens.

    Delegates the parsing to _extract_number_with_text_en_helper and then
    trims the articles that may lead the returned tokens.

    Args:
        tokens [Token]: the tokens to parse
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): True if we should look for fractions and
                                   decimals.

    Returns:
        ReplaceableNumber

    """
    number, number_tokens = _extract_number_with_text_en_helper(
        tokens, short_scale, ordinals, fractional_numbers)

    # drop leading articles in place
    while number_tokens:
        if number_tokens[0].word not in _ARTICLES_EN:
            break
        del number_tokens[0]

    return ReplaceableNumber(number, number_tokens)
def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
    """
    Extract a mixed number such as '2 and 3/4' from a list of tokens.

    Note that "one half" or similar will be parsed by the whole number
    function.

    Args:
        tokens [Token]: words and their indexes in the original string.
        short_scale boolean: passed through to the number extraction
        ordinals boolean: passed through to the number extraction

    Returns:
        (int or float, [Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.

    """
    for marker in _FRACTION_MARKER_EN:
        parts = partition_list(tokens, lambda t: t.word == marker)
        if len(parts) != 3:
            continue

        before, separator, after = parts
        whole_candidates = _extract_numbers_with_text_en(
            before, short_scale, ordinals, fractional_numbers=False)
        fraction_candidates = _extract_numbers_with_text_en(
            after, short_scale, ordinals, fractional_numbers=True)

        if not (whole_candidates and fraction_candidates):
            return None, None

        # the number right before the marker must not be a fraction and the
        # number right after it must be one
        whole = whole_candidates[-1]
        fraction = fraction_candidates[0]
        if whole.value >= 1 and 0 < fraction.value < 1:
            return whole.value + fraction.value, \
                whole.tokens + separator + fraction.tokens

    return None, None
def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
    """
    Handle numbers not handled by the decimal or fraction functions. This is
    generally whole numbers. Note that phrases such as "one half" will be
    handled by this function, while "one and a half" are handled by the
    fraction function.

    Args:
        tokens [Token]: tokens to scan. The list may be modified: the "one"
            following an explicit ordinal such as "3rd one" is replaced by
            an empty Token.
        short_scale boolean: selects the short or long scale number tables
        ordinals boolean: True to read ordinal words as their number values.
            None additionally makes ordinal words and the words of
            _SPOKEN_EXTRA_NUM_EN be skipped instead of parsed.

    Returns:
        int or float, [Tokens]
        The value parsed, and tokens that it corresponds to. The value is
        False when no number was found.

    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data_en(short_scale, speech=ordinals is not None)

    number_words = []  # type: [Token]
    val = False  # running value of the number being assembled
    prev_val = None  # value reached at the previous numeric word
    next_val = None  # fraction value of the following word ("2 fifths")
    to_sum = []  # completed "places" of a long number, summed at the end

    for idx, token in enumerate(tokens):
        current_val = None
        # the previous iteration already consumed this token as a fraction
        if next_val:
            next_val = None
            continue

        word = token.word.lower()
        if word in _ARTICLES_EN or word in _NEGATIVES_EN:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word.lower() if idx > 0 else ""
        next_word = tokens[idx + 1].word.lower() if idx + 1 < len(tokens) \
            else ""

        if is_numeric(word[:-2]) and \
                (word.endswith("st") or word.endswith("nd") or
                 word.endswith("rd") or word.endswith("th")):

            # explicit ordinals, 1st, 2nd, 3rd, 4th.... Nth
            word = word[:-2]

            # handle nth one
            if next_word == "one":
                # would return 1 instead otherwise
                tokens[idx + 1] = Token("", idx)
                next_word = ""

        # TODO replaces the wall of "and" and "or" with all() or any() as
        #  appropriate, the whole codebase should be checked for this pattern
        if word not in string_num_scale and \
                word not in _STRING_NUM_EN and \
                word not in _SUMS_EN and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not is_fractional_en(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # not a number word: stop if a number was already collected,
            # otherwise discard the articles/negatives gathered so far
            words_only = [token.word for token in number_words]

            if number_words and not all([w.lower() in
                                         _ARTICLES_EN |
                                         _NEGATIVES_EN for w in words_only]):
                break
            else:
                number_words = []
                continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS_EN \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES_EN \
                and prev_word not in _ARTICLES_EN:
            # nothing links this word to the previous one: start a new number
            number_words = [token]

        elif prev_word in _SUMS_EN and word in _SUMS_EN:
            number_words = [token]
        elif ordinals is None and \
                (word in string_num_ordinal or word in _SPOKEN_EXTRA_NUM_EN):
            # flagged to ignore this token
            continue
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_EN:
            val = _STRING_NUM_EN.get(word)
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale.get(word)
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if (prev_word in _SUMS_EN and val and val < 10) or all([prev_word in
                                                                multiplies,
                                                                val < prev_val if prev_val else False]):
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False and \
                not (ordinals is None and word in string_num_ordinal):
            val = is_fractional_en(word, short_scale=short_scale,
                                   spoken=ordinals is not None)

            current_val = val

        # 2 fifths
        if ordinals is False:
            next_val = is_fractional_en(next_word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES_EN:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
                current_val = val

        else:
            if current_val and all([
                prev_word in _SUMS_EN,
                word not in _SUMS_EN,
                word not in multiplies,
                current_val >= 10]):
                # Backtrack - we've got numbers we can't sum.
                number_words.pop()
                val = prev_val
                break
            prev_val = val

            if word in multiplies and next_word not in multiplies:
                # handle long numbers
                # six hundred sixty six
                # two million five hundred thousand
                #
                # This logic is somewhat complex, and warrants
                # extensive documentation for the next coder's sake.
                #
                # The current word is a power of ten. `current_val` is
                # its integer value. `val` is our working sum
                # (above, when `current_val` is 1 million, `val` is
                # 2 million.)
                #
                # `string_num_scale` is indexed by word below, giving the
                # value of each scale word: string_num_scale["million"]
                # is compared against `current_val`.
                #
                # We need go over the rest of the tokens, looking for other
                # powers of ten. If we find one, we compare it with the
                # current value, to see if it's smaller than the current
                # power of ten.
                #
                # Numbers which are not powers of ten will be passed over.
                #
                # If all the remaining powers of ten are smaller than our
                # current value, we can set the current value aside for
                # later, and begin extracting another portion of our final
                # result.
                # For example, suppose we have the following string.
                # The current word is "million".`val` is 9000000.
                # `current_val` is 1000000.
                #
                #    "nine **million** nine *hundred* seven **thousand**
                #     six *hundred* fifty seven"
                #
                # Iterating over the rest of the string, the current
                # value is larger than all remaining powers of ten.
                #
                # The if statement passes, and nine million (9000000)
                # is appended to `to_sum`.
                #
                # The main variables are reset, and the main loop begins
                # assembling another number, which will also be appended
                # under the same conditions.
                #
                # By the end of the main loop, to_sum will be a list of each
                # "place" from 100 up: [9000000, 907000, 600]
                #
                # The final three digits will be added to the sum of that
                # list at the end of the main loop, to produce the extracted
                # number:
                #
                #    sum([9000000, 907000, 600]) + 57
                # == 9,000,000 + 907,000 + 600 + 57
                # == 9,907,657
                #
                # >>> foo = "nine million nine hundred seven thousand six
                #            hundred fifty seven"
                # >>> extract_number(foo)
                # 9907657

                time_to_sum = True
                for other_token in tokens[idx+1:]:
                    if other_token.word.lower() in multiplies:
                        if string_num_scale[other_token.word.lower()] >= current_val:
                            time_to_sum = False
                        else:
                            continue
                    if not time_to_sum:
                        break
                if time_to_sum:
                    to_sum.append(val)
                    val = 0
                    prev_val = 0

    # add the places set aside while assembling a long number
    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words
def extract_number_en(text, short_scale=True, ordinals=False):
    """
    Extract a number from an English text string.

    Handles pronunciations in long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract the number from; it is lowercased
                    before being tokenized
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
    tokens = tokenize(text.lower())
    extracted = _extract_number_with_text_en(tokens, short_scale, ordinals)
    return extracted.value
""" if not text: return None time_units = { 'microseconds': 0, 'milliseconds': 0, 'seconds': 0, 'minutes': 0, 'hours': 0, 'days': 0, 'weeks': 0 } pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}s?" text = _convert_words_to_numbers_en(text) for unit_en in time_units: unit_pattern = pattern.format(unit=unit_en[:-1]) # remove 's' from unit def repl(match): time_units[unit_en] += float(match.group(1)) return '' text = re.sub(unit_pattern, repl, text) text = text.strip() duration = timedelta(**time_units) if any(time_units.values()) else None return (duration, text) def extract_datetime_en(text, anchorDate=None, default_time=None): """ Convert a human date reference into an exact datetime Convert things like "today" "tomorrow afternoon" "next Tuesday at 4pm" "August 3rd" into a datetime. If a reference date is not provided, the current local time is used. Also consumes the words used to define the date returning the remaining string. For example, the string "what is Tuesday's weather forecast" returns the date for the forthcoming Tuesday relative to the reference date and the remainder string "what is weather forecast". The "next" instance of a day or weekend is considered to be no earlier than 48 hours in the future. On Friday, "next Monday" would be in 3 days. On Saturday, "next Monday" would be in 9 days. Args: text (str): string containing date words anchorDate (datetime): A reference date/time for "tommorrow", etc default_time (time): Time to set if no time was found in the string Returns: [datetime, str]: An array containing the datetime and the remaining text not consumed in the parsing, or None if no date or time related text was found. """ def clean_string(s): # normalize and lowercase utt (replaces words with numbers) s = _convert_words_to_numbers_en(s, ordinals=None) # clean unneeded punctuation and capitalization among other things. 
s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ .replace("oclock", "o'clock").replace("couple", "2") \ .replace("centuries", "century").replace("decades", "decade") \ .replace("millenniums", "millennium") wordList = s.split() for idx, word in enumerate(wordList): word = word.replace("'s", "") ordinals = ["rd", "st", "nd", "th"] if word[0].isdigit(): for ordinal in ordinals: # "second" is the only case we should not do this if ordinal in word and "second" not in word: word = word.replace(ordinal, "") wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if not anchorDate: anchorDate = now_local() if text == "": return None found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 today = anchorDate.strftime("%w") currentYear = anchorDate.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersAM = ['morning'] timeQualifiersPM = ['afternoon', 'evening', 'night', 'tonight'] timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) year_markers = ['in', 'on', 'of'] markers = year_markers + ['at', 'by', 'this', 'around', 'for', "within"] days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'] months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] recur_markers = days + [d + 's' for d in days] + ['weekend', 'weekday', 'weekends', 'weekdays'] monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] year_multiples = ["decade", "century", "millennium"] day_multiples = ["weeks", 
"months", "years"] words = clean_string(text) for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # this isn't in clean string because I don't want to save back to words word = word.rstrip('s') start = idx used = 0 # save timequalifier for later if word == "ago" and dayOffset: dayOffset = - dayOffset used += 1 if word == "now" and not datestr: resultStr = " ".join(words[idx + 1:]) resultStr = ' '.join(resultStr.split()) extractedDate = anchorDate.replace(microsecond=0) return [extractedDate, resultStr] elif wordNext in year_multiples: multiplier = None if is_numeric(word): multiplier = extract_number_en(word) multiplier = multiplier or 1 multiplier = int(multiplier) used += 2 if wordNext == "decade": yearOffset = multiplier * 10 elif wordNext == "century": yearOffset = multiplier * 100 elif wordNext == "millennium": yearOffset = multiplier * 1000 elif word in year_markers and is_numeric(wordNext) and len(wordNext) == 4: yearOffset = int(wordNext) - int(currentYear) used += 2 hasYear = True # couple of elif word == "2" and wordNext == "of" and \ wordNextNext in year_multiples: multiplier = 2 used += 3 if wordNextNext == "decade": yearOffset = multiplier * 10 elif wordNextNext == "century": yearOffset = multiplier * 100 elif wordNextNext == "millennium": yearOffset = multiplier * 1000 elif word == "2" and wordNext == "of" and \ wordNextNext in day_multiples: multiplier = 2 used += 3 if wordNextNext == "years": yearOffset = multiplier elif wordNextNext == "months": monthOffset = multiplier elif wordNextNext == "weeks": dayOffset = multiplier * 7 elif word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "today" and not fromFlag: dayOffset = 0 used += 1 elif word == "tomorrow" and not fromFlag: 
dayOffset = 1 used += 1 elif word == "day" and wordNext == "before" and wordNextNext == "yesterday" and not fromFlag: dayOffset = -2 used += 3 elif word == "before" and wordNext == "yesterday" and not fromFlag: dayOffset = -2 used += 2 elif word == "yesterday" and not fromFlag: dayOffset = -1 used += 1 elif (word == "day" and wordNext == "after" and wordNextNext == "tomorrow" and not fromFlag and (not wordPrev or not wordPrev[0].isdigit())): dayOffset = 2 used = 3 if wordPrev == "the": start -= 1 used += 1 # parse 5 days, 10 weeks, last week, next week elif word == "day": if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 elif word == "week" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev == "next": dayOffset = 7 start -= 1 used = 2 elif wordPrev == "last": dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "month" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "next": monthOffset = 1 start -= 1 used = 2 elif wordPrev == "last": monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "year" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "next": yearOffset = 1 start -= 1 used = 2 elif wordPrev == "last": yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "next": if dayOffset <= 2: dayOffset += 7 used += 1 start -= 1 elif wordPrev == "last": dayOffset -= 7 used += 1 start -= 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort and not fromFlag: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and (wordPrev[0].isdigit() or (wordPrev == "of" and wordPrevPrev[0].isdigit())): if wordPrev == "of" and wordPrevPrev[0].isdigit(): datestr += " " + words[idx - 2] used += 1 start -= 1 else: datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # if no date indicators found, it may not be the month of May # may "i/we" ... # "... 
may be" elif word == 'may' and wordNext in ['i', 'we', 'be']: datestr = "" # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("today") validFollowups.append("tomorrow") validFollowups.append("yesterday") validFollowups.append("next") validFollowups.append("last") validFollowups.append("now") validFollowups.append("this") if (word == "from" or word == "after") and wordNext in validFollowups: used = 2 fromFlag = True if wordNext == "tomorrow": dayOffset += 1 elif wordNext == "yesterday": dayOffset -= 1 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext == "next": if dayOffset <= 2: tmpOffset += 7 used += 1 start -= 1 elif wordNext == "last": tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 1] == "this": start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None military = False for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "noon": hrAbs = 12 used += 1 elif word == "midnight": hrAbs = 0 used += 1 elif word == "morning": if hrAbs is None: hrAbs = 8 used += 1 elif word == "afternoon": if hrAbs is None: hrAbs = 15 used += 1 elif word == "evening": if hrAbs is None: hrAbs = 19 used += 1 elif word == "tonight" or word 
== "night": if hrAbs is None: hrAbs = 22 # used += 1 ## NOTE this breaks other tests, TODO refactor me! # couple of time_unit elif word == "2" and wordNext == "of" and \ wordNextNext in ["hours", "minutes", "seconds"]: used += 3 if wordNextNext == "hours": hrOffset = 2 elif wordNextNext == "minutes": minOffset = 2 elif wordNextNext == "seconds": secOffset = 2 # parse half an hour, quarter hour elif word == "hour" and \ (wordPrev in markers or wordPrevPrev in markers): if wordPrev == "half": minOffset = 30 elif wordPrev == "quarter": minOffset = 15 elif wordPrevPrev == "quarter": minOffset = 15 if idx > 2 and words[idx - 3] in markers: words[idx - 3] = "" words[idx - 2] = "" elif wordPrev == "within": hrOffset = 1 else: hrOffset = 1 if wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "this": daySpecified = True words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc # parse in a minute elif word == "minute" and wordPrev == "in": minOffset = 1 words[idx - 1] = "" used += 1 # parse in a second elif word == "second" and wordPrev == "in": secOffset = 1 words[idx - 1] = "" used += 1 elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if wordNext == "tonight" or wordNextNext == "tonight" or \ wordPrev == "tonight" or wordPrevPrev == "tonight" or \ wordNextNextNext == "tonight": remainder = "pm" used += 1 if wordPrev == "tonight": words[idx - 1] = "" if wordPrevPrev == "tonight": words[idx - 2] = "" if wordNextNext == "tonight": used += 1 if wordNextNextNext == "tonight": used += 1 if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if 
remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif wordNext == "in" and wordNextNext == "the" and \ words[idx + 3] == "morning": remainder = "am" used += 3 elif wordNext == "in" and wordNextNext == "the" and \ words[idx + 3] == "afternoon": remainder = "pm" used += 3 elif wordNext == "in" and wordNextNext == "the" and \ words[idx + 3] == "evening": remainder = "pm" used += 3 elif wordNext == "in" and wordNextNext == "morning": remainder = "am" used += 2 elif wordNext == "in" and wordNextNext == "afternoon": remainder = "pm" used += 2 elif wordNext == "in" and wordNextNext == "evening": remainder = "pm" used += 2 elif wordNext == "this" and wordNextNext == "morning": remainder = "am" used = 2 daySpecified = True elif wordNext == "this" and wordNextNext == "afternoon": remainder = "pm" used = 2 daySpecified = True elif wordNext == "this" and wordNextNext == "evening": remainder = "pm" used = 2 daySpecified = True elif wordNext == "at" and wordNextNext == "night": if strHH and int(strHH) > 5: remainder = "pm" else: remainder = "am" used += 2 else: if timeQualifier != "": military = True if strHH and int(strHH) <= 12 and \ (timeQualifier in timeQualifiersPM): strHH += str(int(strHH) + 12) else: # try to parse numbers without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." 
or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 elif ( remainder in recur_markers or wordNext in recur_markers or wordNextNext in recur_markers): # Ex: "7 on mondays" or "3 this friday" # Set strHH so that isTime == True # when am or pm is not specified strHH = strNum used = 1 else: if ( int(strNum) > 100 and ( wordPrev == "o" or wordPrev == "oh" )): # 0800 hours (pronounced oh-eight-hundred) strHH = str(int(strNum) // 100) strMM = str(int(strNum) % 100) military = True if wordNext == "hours": used += 1 elif ( (wordNext == "hours" or wordNext == "hour" or remainder == "hours" or remainder == "hour") and word[0] != '0' and ( int(strNum) < 100 or int(strNum) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minutes" or wordNext == "minute" or \ remainder == "minutes" or remainder == "minute": # "in 10 minutes" minOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "seconds" or wordNext == "second" \ or remainder == "seconds" or remainder == "second": # in 5 seconds secOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(strNum) > 100: # military time, eg. "3300 hours" strHH = str(int(strNum) // 100) strMM = str(int(strNum) % 100) military = True if wordNext == "hours" or wordNext == "hour" or \ remainder == "hours" or remainder == "hour": used += 1 elif wordNext and wordNext[0].isdigit(): # military time, e.g. 
"04 38 hours" strHH = strNum strMM = wordNext military = True used += 1 if (wordNextNext == "hours" or wordNextNext == "hour" or remainder == "hours" or remainder == "hour"): used += 1 elif ( wordNext == "" or wordNext == "o'clock" or ( wordNext == "in" and ( wordNextNext == "the" or wordNextNext == timeQualifier ) ) or wordNext == 'tonight' or wordNextNext == 'tonight'): strHH = strNum strMM = "00" if wordNext == "o'clock": used += 1 if wordNext == "in" or wordNextNext == "in": used += (1 if wordNext == "in" else 2) wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if (wordNextNext and (wordNextNext in timeQualifier or wordNextNextNext in timeQualifier)): if (wordNextNext in timeQualifiersPM or wordNextNextNext in timeQualifiersPM): remainder = "pm" used += 1 if (wordNextNext in timeQualifiersAM or wordNextNextNext in timeQualifiersAM): remainder = "am" used += 1 if timeQualifier != "": if timeQualifier in timeQualifiersPM: remainder = "pm" used += 1 elif timeQualifier in timeQualifiersAM: remainder = "am" used += 1 else: # TODO: Unsure if this is 100% accurate used += 1 military = True else: isTime = False HH = int(strHH) if strHH else 0 MM = int(strMM) if strMM else 0 HH = HH + 12 if remainder == "pm" and HH < 12 else HH HH = HH - 12 if remainder == "am" and HH >= 12 else HH if (not military and remainder not in ['am', 'pm', 'hours', 'minutes', "second", "seconds", "hour", "minute"] and ((not daySpecified) or 0 <= dayOffset < 1)): # ambiguous time, detect whether they mean this evening or # the next morning based on whether it has already passed if anchorDate.hour < HH or (anchorDate.hour == HH and anchorDate.minute < MM): pass # No modification needed elif anchorDate.hour < HH + 12: HH += 12 else: # has passed, assume the next morning dayOffset += 1 if timeQualifier in timeQualifiersPM and HH < 12: HH += 12 if HH > 24 or MM > 59: isTime = False used = 0 if isTime: hrAbs = HH minAbs = MM used += 1 if used > 0: # removed parsed words from the 
sentence for i in range(used): if idx + i >= len(words): break words[idx + i] = "" if wordPrev == "o" or wordPrev == "oh": words[words.index(wordPrev)] = "" if wordPrev == "early": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "late": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if wordPrev == "this": daySpecified = True if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "this": daySpecified = True idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = anchorDate.replace(microsecond=0) if datestr != "": # date included an explicit date, e.g. "june 5" or "june 2, 2017" try: temp = datetime.strptime(datestr, "%B %d") except ValueError: # Try again, allowing the year temp = datetime.strptime(datestr, "%B %d %Y") extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if not hasYear: temp = temp.replace(year=extractedDate.year, tzinfo=extractedDate.tzinfo) if extractedDate < temp: extractedDate = extractedDate.replace( year=int(currentYear), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: # ignore the current HH:MM:SS if relative using days or greater if hrOffset == 0 and minOffset == 0 and secOffset == 0: extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = 
def is_fractional_en(input_str, short_scale=True, spoken=True):
    """
    Check whether a word names a unit fraction and return its value.

    A single trailing lowercase "s" is dropped first (e.g. "fifths"), and
    the remaining word is looked up case-insensitively.

    Args:
        input_str (str): the word to check
        short_scale (bool): take ordinal names from the short-scale table
                            if True, from the long-scale table if False
        spoken (bool): when false, nothing is treated as a fraction and
                       the function returns False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    if input_str[-1:] == "s":
        input_str = input_str[:-1]  # e.g. "fifths" -> "fifth"

    ordinal_names = _SHORT_ORDINAL_EN if short_scale else _LONG_ORDINAL_EN

    # name -> denominator; ordinals for 1 and 2 are left out, so "first"
    # and "second" are not treated as fractions
    denominators = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
    denominators.update(
        {ordinal_names[num]: num for num in ordinal_names if num > 2})

    key = input_str.lower()
    if key in denominators and spoken:
        return 1.0 / denominators[key]
    return False
class EnglishNormalizer(Normalizer):
    """
    Normalizer whose default configuration is read from the bundled
    "text/en-us/normalize.json" resource.

    The file is read once, while the class body is executed.
    """
    # JSON is UTF-8 encoded; pass the encoding explicitly so that the
    # result does not depend on the platform's default locale encoding.
    with open(resolve_resource_file("text/en-us/normalize.json"),
              encoding="utf-8") as f:
        _default_config = json.load(f)

    def numbers_to_digits(self, utterance):
        """
        Replace spelled-out numbers in an utterance with digits.

        Delegates to _convert_words_to_numbers_en with ordinals=None.

        Args:
            utterance (str): the text to convert
        Returns:
            whatever _convert_words_to_numbers_en returns for the utterance
        """
        return _convert_words_to_numbers_en(utterance, ordinals=None)
def is_fractional_es(input_str, short_scale=True):
    """
    Check whether a Spanish word names a unit fraction and return its value.

    A single trailing lowercase "s" is dropped first (e.g. "tercios"), and
    the remaining word is looked up case-insensitively.

    Args:
        input_str (str): the word to check
        short_scale (bool): accepted for API compatibility; not used
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    if input_str.endswith('s', -1):
        input_str = input_str[:len(input_str) - 1]  # e.g. "tercios"

    denominators = {
        "medio": 2, "media": 2, "tercio": 3, "cuarto": 4, "cuarta": 4,
        "quinto": 5, "quinta": 5, "sexto": 6, "sexta": 6,
        "séptimo": 7, "séptima": 7, "octavo": 8, "octava": 8,
        "noveno": 9, "novena": 9, "décimo": 10, "décima": 10,
        "onceavo": 11, "onceava": 11, "doceavo": 12, "doceava": 12,
        "vigésimo": 20, "vigésima": 20,
        "trigésimo": 30, "trigésima": 30,
        "centésimo": 100, "centésima": 100,
        "milésimo": 1000, "milésima": 1000,
    }

    # Lower-case once and use the same key for the membership test and the
    # lookup; indexing with the raw string raised KeyError for input such
    # as "Medio".
    denominator = denominators.get(input_str.lower())
    if denominator is None:
        return False
    return 1.0 / denominator
# # Returns incorrect output on certain fractional phrases like, "cuarto de dos" # TODO: numbers greater than 999999 aWords = text.lower().split() count = 0 result = None while count < len(aWords): val = 0 word = aWords[count] next_next_word = None if count + 1 < len(aWords): next_word = aWords[count + 1] if count + 2 < len(aWords): next_next_word = aWords[count + 2] else: next_word = None # is current word a number? if word in _STRING_NUM_ES: val = _STRING_NUM_ES[word] elif word.isdigit(): # doesn't work with decimals val = int(word) elif is_numeric(word): val = float(word) elif is_fractional_es(word): if not result: result = 1 result = result * is_fractional_es(word) count += 1 continue if not val: # look for fractions like "2/3" aPieces = word.split('/') # if (len(aPieces) == 2 and is_numeric(aPieces[0]) # and is_numeric(aPieces[1])): if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) if val: if result is None: result = 0 # handle fractions if next_word != "avos": result = val else: result = float(result) / float(val) if next_word is None: break # number word and fraction ands = ["y"] if next_word in ands: zeros = 0 if result is None: count += 1 continue newWords = aWords[count + 2:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_es(newText[:-1]) if afterAndVal: if result < afterAndVal or result < 20: while afterAndVal > 1: afterAndVal = afterAndVal / 10.0 for word in newWords: if word == "cero" or word == "0": zeros += 1 else: break for _ in range(0, zeros): afterAndVal = afterAndVal / 10.0 result += afterAndVal break elif next_next_word is not None: if next_next_word in ands: newWords = aWords[count + 3:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_es(newText[:-1]) if afterAndVal: if result is None: result = 0 result += afterAndVal break decimals = ["punto", "coma", ".", ","] if next_word in decimals: zeros = 0 newWords = aWords[count + 2:] newText = "" 
def _es_number_parse(words, i):
    """
    Try to read a spelled-out Spanish number starting at words[i].

    Recognised shape: a value in 1-999, optionally followed by "mil" and
    another value in 1-999.

    Args:
        words (list): the tokens of the sentence
        i (int): index of the first token to examine
    Returns:
        (tuple) or None: (value, index of the first token after the
        number), or None when no number starts at position i
    """
    # TODO Not parsing 'cero': the lookup below requires a truthy value,
    # so a word mapped to 0 is never accepted.
    total = len(words)

    def after_literal(pos, token):
        # Index following `token` when it sits at `pos`, otherwise None.
        if pos < total and token == words[pos]:
            return pos + 1
        return None

    def number_word(pos, low, high):
        # (value, next index) when words[pos] maps into [low, high].
        if pos >= total:
            return None
        value = _STRING_NUM_ES.get(words[pos])
        if value and low <= value <= high:
            return value, pos + 1
        return None

    def below_hundred(pos):
        # A word valued 1-29, or a word valued 30-90 optionally followed
        # by "y" and a word valued 1-9.
        simple = number_word(pos, 1, 29)
        if simple:
            return simple
        tens = number_word(pos, 30, 90)
        if not tens:
            return None
        after_y = after_literal(tens[1], "y")
        if after_y is not None:
            units = number_word(after_y, 1, 9)
            if units:
                return tens[0] + units[0], units[1]
        return tens

    def below_thousand(pos):
        # A word valued 100-900 optionally followed by a 1-99 number,
        # or a bare 1-99 number.
        hundreds = number_word(pos, 100, 900)
        if not hundreds:
            return below_hundred(pos)
        rest = below_hundred(hundreds[1])
        if rest:
            return hundreds[0] + rest[0], rest[1]
        return hundreds

    # check for cero
    zero = number_word(i, 0, 0)
    if zero:
        return zero

    # check for [1-999] (mil [1-999])?
    leading = below_thousand(i)
    if not leading:
        return None
    after_mil = after_literal(leading[1], "mil")
    if after_mil is None:
        return leading
    trailing = below_thousand(after_mil)
    if trailing:
        return leading[0] * 1000 + trailing[0], trailing[1]
    return leading[0] * 1000, after_mil
def normalize_es(text, remove_articles=True):
    """
    Spanish string normalization.

    Collapses whitespace, optionally drops articles and replaces
    spelled-out numbers with digits.

    Args:
        text (str): the text to normalize
        remove_articles (bool): drop words listed in _ARTICLES_ES
    Returns:
        (str): the normalized text, words separated by single spaces
    """
    # TODO return SpanishNormalizer().normalize(text, remove_articles)
    tokens = text.split()  # splitting also discards extra whitespace
    total = len(tokens)
    pieces = []
    pos = 0
    while pos < total:
        token = tokens[pos]

        if remove_articles and token in _ARTICLES_ES:
            pos += 1
            continue

        # Convert numbers into digits; a successful parse also tells us
        # where to resume scanning.
        parsed = _es_number_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
_, word in enumerate(wordlist): s = s.replace(word, word.rstrip('s')) s = s.replace("meses", "mes").replace("anteriores", "anterior") return s def date_found(): return found or \ ( datestr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None if anchorDate is None: anchorDate = now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" words = clean_string(text).split(" ") timeQualifiersList = ['mañana', 'tarde', 'noche'] time_indicators = ["en", "la", "al", "por", "pasados", "pasadas", "día", "hora"] days = ['lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado', 'domingo'] months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre'] monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', 'sep', 'oct', 'nov', 'dic'] nexts = ["siguiente", "próximo", "próxima"] suffix_nexts = ["siguientes", "subsecuentes"] lasts = ["último", "última"] suffix_lasts = ["pasada", "pasado", "anterior", "antes"] nxts = ["después", "siguiente", "próximo", "próxima"] prevs = ["antes", "previa", "previo", "anterior"] froms = ["desde", "en", "para", "después de", "por", "próximo", "próxima", "de"] thises = ["este", "esta"] froms += thises lists = nxts + prevs + froms + time_indicators for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" start = idx used = 0 # save timequalifier for later if word in timeQualifiersList: 
timeQualifier = word # parse today, tomorrow, yesterday elif word == "hoy" and not fromFlag: dayOffset = 0 used += 1 elif word == "mañana" and not fromFlag: dayOffset = 1 used += 1 elif word == "ayer" and not fromFlag: dayOffset -= 1 used += 1 # "before yesterday" and "before before yesterday" elif (word == "anteayer" or (word == "ante" and wordNext == "ayer")) and not fromFlag: dayOffset -= 2 used += 1 if wordNext == "ayer": used += 1 elif word == "ante" and wordNext == "ante" and wordNextNext == \ "ayer" and not fromFlag: dayOffset -= 3 used += 3 elif word == "ante anteayer" and not fromFlag: dayOffset -= 3 used += 1 # day after tomorrow elif word == "pasado" and wordNext == "mañana" and not fromFlag: dayOffset += 2 used = 2 # day before yesterday elif word == "ante" and wordNext == "ayer" and not fromFlag: dayOffset -= 2 used = 2 # parse 5 days, 10 weeks, last week, next week, week after elif word == "día": if wordNext == "pasado" or wordNext == "ante": used += 1 if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used += 1 elif (wordPrev and wordPrev[0].isdigit() and wordNext not in months and wordNext not in monthsShort): dayOffset += int(wordPrev) start -= 1 used += 2 elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ months and wordNextNext not in monthsShort: dayOffset += int(wordNext) start -= 1 used += 2 elif word == "semana" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 for w in nexts: if wordPrev == w: dayOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: dayOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: dayOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "mes" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: monthOffset = 7 start -= 1 used 
= 2 for w in lasts: if wordPrev == w: monthOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: monthOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: monthOffset = -7 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "año" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: yearOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: yearOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: yearOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: yearOffset = -7 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "siguiente": dayOffset += 7 used += 1 start -= 1 elif wordPrev == "pasado": dayOffset -= 7 used += 1 start -= 1 if wordNext == "siguiente": # dayOffset += 7 used += 1 elif wordNext == "pasado": # dayOffset -= 7 used += 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and wordPrev[0].isdigit(): # 13 mayo datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): # mayo 13 datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False elif wordPrevPrev and wordPrevPrev[0].isdigit(): # 13 dia mayo datestr += " " + wordPrevPrev start -= 2 used += 2 if wordNext and word[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNextNext and wordNextNext[0].isdigit(): 
# mayo dia 13 datestr += " " + wordNextNext used += 2 if wordNextNextNext and wordNextNextNext[0].isdigit(): datestr += " " + wordNextNextNext used += 1 hasYear = True else: hasYear = False if datestr in months: datestr = "" # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("hoy") validFollowups.append("mañana") validFollowups.append("ayer") validFollowups.append("anteayer") validFollowups.append("ahora") validFollowups.append("ya") validFollowups.append("ante") # TODO debug word "depois" that one is failing for some reason if word in froms and wordNext in validFollowups: if not (wordNext == "mañana" and wordNext == "ayer") and not ( word == "pasado" or word == "antes"): used = 2 fromFlag = True if wordNext == "mañana" and word != "pasado": dayOffset += 1 elif wordNext == "ayer": dayOffset -= 1 elif wordNext == "anteayer": dayOffset -= 2 elif wordNext == "ante" and wordNextNext == "ayer": dayOffset -= 2 elif (wordNext == "ante" and wordNext == "ante" and wordNextNextNext == "ayer"): dayOffset -= 3 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 # if wordNextNext == "feira": # used += 1 if tmpOffset < 0: tmpOffset += 7 if wordNextNext: if wordNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNextNextNext: if wordNextNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset # if wordNextNextNext == "feira": # used += 1 if wordNext in months: used -= 1 if used > 0: if start - 1 > 0 and words[start - 1] in lists: start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in lists: words[start - 1] = "" found = True daySpecified = True 
# parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "medio" and wordNext == "día": hrAbs = 12 used += 2 elif word == "media" and wordNext == "noche": hrAbs = 0 used += 2 elif word == "mañana": if not hrAbs: hrAbs = 8 used += 1 elif word == "tarde": if not hrAbs: hrAbs = 15 used += 1 elif word == "media" and wordNext == "tarde": if not hrAbs: hrAbs = 17 used += 2 elif word == "tarde" and wordNext == "noche": if not hrAbs: hrAbs = 20 used += 2 elif word == "media" and wordNext == "mañana": if not hrAbs: hrAbs = 10 used += 2 # elif word == "fim" and wordNext == "tarde": # if not hrAbs: # hrAbs = 19 # used += 2 # elif word == "fim" and wordNext == "manha": # if not hrAbs: # hrAbs = 11 # used += 2 elif word == "madrugada": if not hrAbs: hrAbs = 1 used += 2 elif word == "noche": if not hrAbs: hrAbs = 21 used += 1 # parse half an hour, quarter hour elif (word == "hora" and (wordPrev in time_indicators or wordPrevPrev in time_indicators)): if wordPrev == "media": minOffset = 30 elif wordPrev == "cuarto": minOffset = 15 elif wordPrevPrev == "cuarto": minOffset = 15 if idx > 2 and words[idx - 3] in time_indicators: words[idx - 3] = "" words[idx - 2] = "" else: hrOffset = 1 if wordPrevPrev in time_indicators: words[idx - 2] = "" words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += 
word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif wordNext == "mañana" or wordNext == "madrugada": remainder = "am" used += 1 elif wordNext == "tarde": remainder = "pm" used += 1 elif wordNext == "noche": if 0 < int(word[0]) < 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext in thises and wordNextNext == "mañana": remainder = "am" used = 2 elif wordNext in thises and wordNextNext == "tarde": remainder = "pm" used = 2 elif wordNext in thises and wordNextNext == "noche": remainder = "pm" used = 2 else: if timeQualifier != "": if strHH <= 12 and \ (timeQualifier == "mañana" or timeQualifier == "tarde"): strHH += 12 else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if (wordNext == "pm" or wordNext == "p.m." or wordNext == "tarde"): strHH = strNum remainder = "pm" used = 1 elif (wordNext == "am" or wordNext == "a.m." 
or wordNext == "mañana"): strHH = strNum remainder = "am" used = 1 elif (int(word) > 100 and ( # wordPrev == "o" or # wordPrev == "oh" or wordPrev == "cero" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 if wordNext == "hora": used += 1 elif ( wordNext == "hora" and word[0] != '0' and ( int(word) < 100 and int(word) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minuto": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "segundo": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 if wordNext == "hora": used += 1 elif wordNext == "" or ( wordNext == "en" and wordNextNext == "punto"): strHH = word strMM = 00 if wordNext == "en" and wordNextNext == "punto": used += 2 if wordNextNextNext == "tarde": remainder = "pm" used += 1 elif wordNextNextNext == "mañana": remainder = "am" used += 1 elif wordNextNextNext == "noche": if 0 > strHH > 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext[0].isdigit(): strHH = word strMM = wordNext used += 1 if wordNextNext == "hora": used += 1 else: isTime = False strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if (remainder == "pm" and 0 < strHH < 12) else strHH strHH = strHH - 12 if (remainder == "am" and 0 < strHH >= 12) else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "en" or wordPrev == "punto": words[words.index(wordPrev)] = "" if idx > 0 and wordPrev in time_indicators: words[idx - 1] = "" if idx > 1 and wordPrevPrev in time_indicators: words[idx - 2] = "" idx += used - 1 found = True # 
check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": en_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] for idx, en_month in enumerate(en_months): datestr = datestr.replace(months[idx], en_month) for idx, en_month in enumerate(en_monthsShort): datestr = datestr.replace(monthsShort[idx], en_month) temp = datetime.strptime(datestr, "%B %d") if extractedDate.tzinfo: temp = temp.replace(tzinfo=extractedDate.tzinfo) if not hasYear: temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace( year=int(currentYear), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs is None and minAbs is None and default_time: hrAbs = default_time.hour minAbs = default_time.minute if hrAbs != -1 and minAbs != -1: extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs or 0) if (hrAbs or minAbs) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + relativedelta(hours=hrOffset) if minOffset != 
def get_gender_es(word, context=""):
    """ Guess the gender of a word

    Some languages assign genders to specific words.  This method will attempt
    to determine the gender, optionally using the provided context sentence.

    Args:
        word (str): The word to look up
        context (str, optional): String containing word, for context

    Returns:
        str: The code "m" (male) or "f" (female) for the gender, or False
             when no rule applies (including an empty word).
    """
    # Next rules are imprecise and incompleted, but is a good starting point.
    # For more detailed explanation, see
    # http://www.wikilengua.org/index.php/Género_gramatical
    word = word.rstrip("s")
    gender = False

    # If the word appears in the context after another word, guess from
    # that preceding word (e.g. an article) first.
    words = context.split(" ")
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            previous = words[idx - 1]
            gender = get_gender_es(previous)
            break

    if not gender:
        # Slice instead of indexing: the word may be empty (input "" or
        # "s", or an empty token produced by a double space in the
        # context), and word[-1] would raise IndexError.
        ending = word[-1:]
        if ending == "a":
            gender = "f"
        if ending == "o" or ending == "e":
            gender = "m"
    return gender
def is_fractional_eu(input_str):
    """
    Check whether a Basque word names a unit fraction and return its value.

    A single trailing lowercase "s" is dropped first, and the remaining
    word is looked up case-insensitively.

    Args:
        input_str (str): the word to check
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    if input_str.endswith('s', -1):
        input_str = input_str[:len(input_str) - 1]

    denominators = {
        "erdia": 2, "erdi": 2, "heren": 3,
        "laurden": 4, "laurdena": 4,
        "bosten": 5, "bostena": 5, "seiren": 6, "seirena": 6,
        # "zapirena" is kept for backward compatibility; it looks like a
        # misspelling of "zazpirena", which is accepted as well.
        "zazpiren": 7, "zazpirena": 7, "zapirena": 7,
        "zortziren": 8, "zortzirena": 8,
        "bederatziren": 9, "bederatzirena": 9,
        "hamarren": 10, "hamarrena": 10,
        "hamaikaren": 11, "hamaikarena": 11,
        "hamabiren": 12, "hamabirena": 12,
        "hogeiren": 20, "hogeirena": 20,
        "hogeita hamarren": 30, "hogeita hamarrena": 30,
        "ehunen": 100, "ehunena": 100,
        "milaren": 1000, "milarena": 1000,
    }

    # Lower-case once and use the same key for the membership test and the
    # lookup; indexing with the raw string raised KeyError for input such
    # as "Erdi".
    denominator = denominators.get(input_str.lower())
    if denominator is None:
        return False
    return 1.0 / denominator
Args: text (str): the string to normalize Returns: (int) or (float): The value of extracted number """ aWords = text.lower().split() count = 0 result = None while count < len(aWords): val = 0 word = aWords[count] next_next_word = None if count + 1 < len(aWords): next_word = aWords[count + 1] if count + 2 < len(aWords): next_next_word = aWords[count + 2] else: next_word = None # is current word a number? if word in _NUM_STRING_EU: val = _NUM_STRING_EU[word] elif word.isdigit(): # doesn't work with decimals val = int(word) elif is_numeric(word): val = float(word) elif is_fractional_eu(word): if next_word in _NUM_STRING_EU: # erdi bat, heren bat, etab result = _NUM_STRING_EU[next_word] # hurrengo hitza (bat, bi, ...) salto egin next_word = None count += 2 elif not result: result = 1 count += 1 result = result * is_fractional_eu(word) continue if not val: # look for fractions like "2/3" aPieces = word.split('/') # if (len(aPieces) == 2 and is_numeric(aPieces[0]) # and is_numeric(aPieces[1])): if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) if val: if result is None: result = 0 # handle fractions if next_word == "en" or next_word == "ren": result = float(result) / float(val) else: result = val if next_word is None: break # number word and fraction ands = ["eta"] if next_word in ands: zeros = 0 if result is None: count += 1 continue newWords = aWords[count + 2:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_eu(newText[:-1]) if afterAndVal: if result < afterAndVal or result < 20: while afterAndVal > 1: afterAndVal = afterAndVal / 10.0 for word in newWords: if word == "zero" or word == "0": zeros += 1 else: break for _ in range(0, zeros): afterAndVal = afterAndVal / 10.0 result += afterAndVal break elif next_next_word is not None: if next_next_word in ands: newWords = aWords[count + 3:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_eu(newText[:-1]) if afterAndVal: if 
def eu_number_parse(words, i):
    """Parse a Basque number spelled out in `words`, starting at index `i`.

    The list passed in is never modified.

    Args:
        words (list): the tokens of the sentence
        i (int): index in `words` where the number is expected to start
    Returns:
        tuple or None: (value, index of the first token after the number),
            or None when no number starts at `i`.
    """
    def eu_cte(i, s):
        # Match the literal token `s` at position i.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def eu_number_word(i, mi, ma, token=None):
        # Look up the token at position i (or `token` when the caller has
        # already stripped a suffix from it) and accept it when its value
        # lies in [mi, ma].  The explicit None test lets zero be matched.
        if i < len(words):
            v = _NUM_STRING_EU.get(words[i] if token is None else token)
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def eu_number_1_99(i):
        if i >= len(words):
            return None

        r1 = eu_number_word(i, 1, 29)
        if r1:
            return r1

        # Tens may carry the linking suffix "-ta" ("hogeita bost" = 25).
        # The suffix is stripped on a local copy only, so a word that
        # merely ends in "ta" is left intact for the caller.
        token = words[i]
        composed = token != "eta" and token.endswith("ta")
        if composed:
            token = token[:-2]

        r1 = eu_number_word(i, 20, 90, token)
        if r1:
            v1, i1 = r1
            if composed:
                r3 = eu_number_word(i1, 1, 19)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            return r1
        return None

    def eu_number_1_999(i):
        r1 = eu_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = eu_cte(i1, "eta")
            if r2:
                r3 = eu_number_1_99(r2[1])
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            # Bare hundreds; a dangling "eta" is left unconsumed.
            return r1

        # [1-99]
        return eu_number_1_99(i)

    def eu_number(i):
        # zero
        r1 = eu_number_word(i, 0, 0)
        if r1:
            return r1

        # [1-999] ("mila" [1-999])?
        r1 = eu_number_1_999(i)
        if not r1:
            return None
        v1, i1 = r1
        r2 = eu_cte(i1, "mila")
        if not r2:
            return r1
        i2 = r2[1]
        r3 = eu_number_1_999(i2)
        if r3:
            v3, i3 = r3
            return v1 * 1000 + v3, i3
        return v1 * 1000, i2

    return eu_number(i)
early = tomorrow morning synonyms = {"goiza": ["egunsentia", "goiz", "oso goiz"], "arratsaldea": ["arratsa", "bazkalostea", "arratsalde", "arrats"], "gaua": ["iluntzea", "berandu", "gau", "gaba"]} for syn in synonyms: for word in synonyms[syn]: s = s.replace(" " + word + " ", " " + syn + " ") # relevant plurals wordlist = ["goizak", "arratsaldeak", "gauak", "egunak", "asteak", "urteak", "minutuak", "segunduak", "hurrengoak", "datozenak", "orduak", "hilabeteak"] for _, word in enumerate(wordlist): s = s.replace(word, word.rstrip('ak')) # s = s.replace("meses", "mes").replace("anteriores", "anterior") return s def date_found(): return found or \ ( datestr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if input_str == "": return None if anchorDate is None: anchorDate = datetime.now() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate dateNow = dateNow.replace(tzinfo=None) today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" words = clean_string(input_str).split(" ") timeQualifiersList = ['goiza', 'arratsaldea', 'gaua'] time_indicators = ["en", "la", "al", "por", "pasados", "pasadas", "día", "hora"] days = ['astelehena', 'asteartea', 'asteazkena', 'osteguna', 'ostirala', 'larunbata', 'igandea'] months = ['urtarrila', 'otsaila', 'martxoa', 'apirila', 'maiatza', 'ekaina', 'uztaila', 'abuztua', 'iraila', 'urria', 'azaroa', 'abendua'] monthsShort = ['urt', 'ots', 'mar', 'api', 'mai', 'eka', 'uzt', 'abu', 'ira', 'urr', 'aza', 'abe'] nexts = ["hurrengo", "datorren", "ondorengo"] suffix_nexts = ["barru"] lasts = ["azken", "duela"] suffix_lasts = ["aurreko"] nxts = ["ondorengo", "hurrengo", "datorren"] prevs = ["aurreko", "duela", "previo", "anterior"] # TODO froms = ["desde", "en", "para", "después de", "por", "próximo", "próxima", "de"] thises = 
["hau"] froms += thises lists = nxts + prevs + froms + time_indicators for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" start = idx used = 0 # save timequalifier for later if word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, yesterday elif (word == "gaur" or word == "gaurko") and not fromFlag: dayOffset = 0 used += 1 elif (word == "bihar" or word == "biharko") and not fromFlag: dayOffset = 1 used += 1 elif (word == "atzo" or word == "atzoko") and not fromFlag: dayOffset -= 1 used += 1 # before yesterday elif (word == "herenegun" or word == "herenegungo") and not fromFlag: dayOffset -= 2 used += 1 # if wordNext == "ayer": # used += 1 # elif word == "ante" and wordNext == "ante" and wordNextNext == \ # "ayer" and not fromFlag: # dayOffset -= 3 # used += 3 # elif word == "ante anteayer" and not fromFlag: # dayOffset -= 3 # used += 1 # day after tomorrow elif (word == "etzi" or word == "etziko") and not fromFlag: dayOffset += 2 used = 1 elif (word == "etzidamu" or word == "etzidamuko") and not fromFlag: dayOffset += 3 used = 1 # parse 5 days, 10 weeks, last week, next week, week after elif word == "egun" or word == "eguna" or word == "eguneko": if wordPrevPrev and wordPrevPrev == "duela": used += 1 if wordPrev and wordPrev[0].isdigit(): dayOffset -= int(wordPrev) start -= 1 used += 1 elif (wordPrev and wordPrev[0].isdigit() and wordNext not in months and wordNext not in monthsShort): dayOffset += int(wordPrev) start -= 1 used += 2 elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ months and wordNextNext not in monthsShort: dayOffset += int(wordNext) start -= 1 used += 2 elif word == "aste" or word == "astea" or word == "asteko" and not fromFlag: 
if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 for w in nexts: if wordPrev == w: dayOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: dayOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: dayOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "hilabete" or word == "hilabetea" or word == "hilabeteko" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: monthOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: monthOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: monthOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: monthOffset = -7 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "urte" or word == "urtea" or word == "urteko" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: yearOffset = 1 start -= 1 used = 2 for w in lasts: if wordPrev == w: yearOffset = -1 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: yearOffset = 1 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "hurrengo": dayOffset += 7 used += 1 start -= 1 elif wordPrev == "aurreko": dayOffset -= 7 used += 1 start -= 1 if wordNext == "hurrengo": # dayOffset += 7 used += 1 elif wordNext == "aurreko": # dayOffset -= 7 used += 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and wordPrev[0].isdigit(): # 13 mayo datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): # mayo 13 datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False elif wordPrevPrev and wordPrevPrev[0].isdigit(): # 13 dia mayo datestr += " " + wordPrevPrev start -= 2 used += 2 if wordNext and word[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNextNext and wordNextNext[0].isdigit(): # mayo dia 13 datestr += " " + wordNextNext used += 2 if wordNextNextNext and wordNextNextNext[0].isdigit(): datestr += " " + wordNextNextNext used += 1 hasYear = True else: hasYear = False if datestr in months: datestr = "" # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("gaur") validFollowups.append("bihar") validFollowups.append("atzo") # validFollowups.append("atzoko") validFollowups.append("herenegun") validFollowups.append("orain") validFollowups.append("oraintxe") # validFollowups.append("ante") # TODO if word in froms and wordNext in validFollowups: if not (word == "bihar" or word == "herenegun" or word == "atzo"): used = 1 
fromFlag = True if wordNext == "bihar": dayOffset += 1 elif wordNext == "atzo" or wordNext == "atzoko": dayOffset -= 1 elif wordNext == "herenegun": dayOffset -= 2 # elif (wordNext == "ante" and wordNext == "ante" and # wordNextNextNext == "ayer"): # dayOffset -= 3 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 # if wordNextNext == "feira": # used += 1 if tmpOffset < 0: tmpOffset += 7 if wordNextNext: if wordNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNextNextNext: if wordNextNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset # if wordNextNextNext == "feira": # used += 1 if wordNext in months: used -= 1 if used > 0: if start - 1 > 0 and words[start - 1] in lists: start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in lists: words[start - 1] = "" found = True daySpecified = True # parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "eguerdi" or word == "eguerdia" or word == "eguerdian": hrAbs = 12 used += 2 elif word == "gauerdi" or word == "gauerdia" or word == "gauerdian": hrAbs = 0 used += 2 elif word == "goiza": if not hrAbs: hrAbs = 8 used += 1 elif word == "arratsaldea" or word == "arratsa" or word == "arratsean" or word == "arratsaldean": if not hrAbs: hrAbs = 15 used 
+= 1 # TODO # elif word == "media" and wordNext == "tarde": # if not hrAbs: # hrAbs = 17 # used += 2 elif word == "iluntze" or word == "iluntzea" or word == "iluntzean": if not hrAbs: hrAbs = 20 used += 2 # TODO # elif word == "media" and wordNext == "mañana": # if not hrAbs: # hrAbs = 10 # used += 2 # elif word == "fim" and wordNext == "tarde": # if not hrAbs: # hrAbs = 19 # used += 2 elif word == "egunsentia" or word == "egunsentian" or word == "egunsenti": if not hrAbs: hrAbs = 6 used += 1 # elif word == "madrugada": # if not hrAbs: # hrAbs = 1 # used += 2 elif word == "gaua" or word == "gauean" or word == "gau": if not hrAbs: hrAbs = 21 used += 1 # parse half an hour, quarter hour # TODO elif (word == "hora" and (wordPrev in time_indicators or wordPrevPrev in time_indicators)): if wordPrev == "media": minOffset = 30 elif wordPrev == "cuarto": minOffset = 15 elif wordPrevPrev == "cuarto": minOffset = 15 if idx > 2 and words[idx - 3] in time_indicators: words[idx - 3] = "" words[idx - 2] = "" else: hrOffset = 1 if wordPrevPrev in time_indicators: words[idx - 2] = "" words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif wordNext == "goiza" or wordNext == "egunsentia" or wordNext == "goizeko" or wordNext == "egunsentiko": remainder = "am" used += 1 elif wordPrev == "arratsaldeko" or wordPrev == "arratsaldea" or wordPrev == "arratsaldean": remainder = "pm" used += 1 elif wordNext 
== "gaua" or wordNext == "gauean" or wordNext == "gaueko": if 0 < int(word[0]) < 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext in thises and (wordNextNext == "goiza" or wordNextNext == "goizean" or wordNextNext == "goizeko"): remainder = "am" used = 2 elif wordNext in thises and \ (wordNextNext == "arratsaldea" or wordNextNext == "arratsaldean" or wordNextNext == "arratsaldeko"): remainder = "pm" used = 2 elif wordNext in thises and (wordNextNext == "gaua" or wordNextNext == "gauean" or wordNextNext == "gaueko"): remainder = "pm" used = 2 else: if timeQualifier != "": if strHH <= 12 and \ (timeQualifier == "goiza" or timeQualifier == "arratsaldea"): strHH += 12 else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if (wordNext == "pm" or wordNext == "p.m." or wordPrev == "arratsaldeko"): strHH = strNum remainder = "pm" used = 0 elif (wordNext == "am" or wordNext == "a.m." 
or wordPrev == "goizeko"): strHH = strNum remainder = "am" used = 0 elif (int(word) > 100 and ( # wordPrev == "o" or # wordPrev == "oh" or wordPrev == "zero" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 if wordNext == "orduak": used += 1 elif ( wordNext == "orduak" and word[0] != '0' and ( int(word) < 100 and int(word) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minutu": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "segundu": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 if wordNext == "ordu": used += 1 elif wordNext == "" or ( wordNext == "puntuan"): strHH = word strMM = 00 if wordNext == "puntuan": used += 2 if wordNextNextNext == "arratsaldea": remainder = "pm" used += 1 elif wordNextNextNext == "goiza": remainder = "am" used += 1 elif wordNextNextNext == "gaua": if 0 > strHH > 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext[0].isdigit(): strHH = word strMM = wordNext used += 1 if wordNextNext == "orduak": used += 1 else: isTime = False strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if (remainder == "pm" and 0 < strHH < 12) else strHH strHH = strHH - 12 if (remainder == "am" and 0 < strHH >= 12) else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "puntuan": words[words.index(wordPrev)] = "" if idx > 0 and wordPrev in time_indicators: words[idx - 1] = "" if idx > 1 and wordPrevPrev in time_indicators: words[idx - 2] = "" idx += used - 1 found = True # check that we found a date if not date_found(): return 
None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) extractedDate = extractedDate.replace(tzinfo=None) if datestr != "": en_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] for idx, en_month in enumerate(en_months): datestr = datestr.replace(months[idx], en_month) for idx, en_month in enumerate(en_monthsShort): datestr = datestr.replace(monthsShort[idx], en_month) temp = datetime.strptime(datestr, "%B %d") temp = temp.replace(tzinfo=None) if not hasYear: temp = temp.replace(year=extractedDate.year) print(gettz(temp.tzname())) print(extractedDate.tzname(), temp.tzname()) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs is None and minAbs is None and default_time: hrAbs = default_time.hour minAbs = default_time.minute if hrAbs != -1 and minAbs != -1: extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs or 0) if (hrAbs or minAbs) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + 
def get_gender_eu(word, raw_string=""):
    """Report the grammatical gender of a Basque word.

    There is no grammatical gender in Basque, so both arguments are
    ignored and the answer is always False.

    Args:
        word (str): the word to look up (unused)
        raw_string (str, optional): sentence containing the word (unused)
    Returns:
        bool: always False
    """
    return False
# import json from datetime import timedelta from lingua_franca.internal import resolve_resource_file from lingua_franca.lang.common_data_fa import (_FARSI_BIG, _FARSI_HUNDREDS, _FARSI_ONES, _FARSI_TENS, _FORMAL_VARIANT) from lingua_franca.lang.parse_common import Normalizer from lingua_franca.time import now_local def _is_number(s): try: float(s) return True except ValueError: return False def _parse_sentence(text): for key, value in _FORMAL_VARIANT.items(): text = text.replace(key, value) ar = text.split() result = [] current_number = 0 current_words = [] s = 0 step = 10 mode = 'init' def finish_num(): nonlocal current_number nonlocal s nonlocal result nonlocal mode nonlocal current_words current_number += s if current_number != 0: result.append((current_number, current_words)) s = 0 current_number = 0 current_words = [] mode = 'init' for x in ar: if x == "و": if mode == 'num_ten' or mode == 'num_hundred' or mode == 'num_one': mode += '_va' current_words.append(x) elif mode == 'num': current_words.append(x) else: finish_num() result.append(x) elif x == "نیم": current_words.append(x) current_number += 0.5 finish_num() elif x in _FARSI_ONES: t = _FARSI_ONES.index(x) if mode != 'init' and mode != 'num_hundred_va' and mode != 'num': if not(t < 10 and mode == 'num_ten_va'): finish_num() current_words.append(x) s += t mode = 'num_one' elif x in _FARSI_TENS: if mode != 'init' and mode != 'num_hundred_va' and mode != 'num': finish_num() current_words.append(x) s += _FARSI_TENS.index(x)*10 mode = 'num_ten' elif x in _FARSI_HUNDREDS: if mode != 'init' and mode != 'num': finish_num() current_words.append(x) s += _FARSI_HUNDREDS.index(x)*100 mode = 'num_hundred' elif x in _FARSI_BIG: current_words.append(x) d = _FARSI_BIG.index(x) if mode == 'init' and d == 1: s = 1 s *= 10**(3*d) current_number += s s = 0 mode = 'num' elif _is_number(x): current_words.append(x) current_number = float(x) finish_num() else: finish_num() result.append(x) if mode[:3] == 'num': finish_num() 
def extract_duration_fa(text):
    """Extract a duration from a Farsi phrase.

    Every "<number> <unit>" pair found in the text (units are the keys of
    _time_units and _date_units) is added to the total.  A unit that is
    not preceded by a number counts once.  The words used in the duration
    are consumed, and the remainder returned.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing.  The duration is
                    timedelta(0) when no duration is found.
    """
    remainder = []
    result = timedelta(0)
    # (value, words) of the latest number that has not met its unit yet
    pending = None

    for token in _parse_sentence(text):
        if token == "و":
            continue

        if isinstance(token, tuple):
            # A second number in a row: the first one was not part of a
            # duration, so hand its words back to the remainder.
            if pending:
                remainder.extend(pending[1])
            pending = token
            continue

        unit = _time_units.get(token)
        if unit is None:
            unit = _date_units.get(token)

        if unit is not None:
            # A bare unit means one of it, as in extract_datetime_fa.
            result += unit * (pending[0] if pending else 1)
            pending = None
            continue

        if pending:
            remainder.extend(pending[1])
            pending = None
        remainder.append(token)

    # A trailing number without a unit is ordinary text as well.
    if pending:
        remainder.extend(pending[1])

    return (result, " ".join(remainder))
The "next" instance of a day or weekend is considered to be no earlier than 48 hours in the future. On Friday, "next Monday" would be in 3 days. On Saturday, "next Monday" would be in 9 days. Args: text (str): string containing date words anchorDate (datetime): A reference date/time for "tommorrow", etc default_time (time): Time to set if no time was found in the string Returns: [datetime, str]: An array containing the datetime and the remaining text not consumed in the parsing, or None if no date or time related text was found. """ if text == "": return None text = text.lower().replace('‌', ' ').replace('.', '').replace('،', '') \ .replace('?', '').replace("پس فردا", "پسفردا") \ .replace('یک شنبه', 'یکشنبه') \ .replace('دو شنبه', 'دوشنبه') \ .replace('سه شنبه', 'سهشنبه') \ .replace('چهار شنبه', 'چهارشنبه') \ .replace('پنج شنبه', 'پنجشنبه') \ .replace('بعد از ظهر', 'بعدازظهر') \ if not anchorDate: anchorDate = now_local() today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0) today_weekday = int(anchorDate.strftime("%w")) weekday_names = [ 'دوشنبه', 'سهشنبه', 'چهارشنبه', 'پنجشنبه', 'جمعه', 'شنبه', 'یکشنبه', ] daysDict = { 'پریروز': today + timedelta(days= -2), 'دیروز': today + timedelta(days= -1), 'امروز': today, 'فردا': today + timedelta(days= 1), 'پسفردا': today + timedelta(days= 2), } timesDict = { 'صبح': timedelta(hours=8), 'بعدازظهر': timedelta(hours=15), } exactDict = { 'الان': anchorDate, } nextWords = ["بعد", "دیگه"] prevWords = ["پیش", "قبل"] ar = _parse_sentence(text) mode = 'none' number_seen = None delta_seen = timedelta(0) remainder = [] result = None for x in ar: handled = 1 if mode == 'finished': remainder.append(x) elif x == 'و' and mode[:5] == 'delta': pass elif type(x) == tuple: number_seen = x elif x in weekday_names: dayOffset = (weekday_names.index(x) + 1) - today_weekday if dayOffset < 0: dayOffset += 7 result = today + timedelta(days=dayOffset) mode = 'time' elif x in exactDict: result = exactDict[x] mode = 'finished' elif x in 
def is_fractional_fa(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # A single trailing "s" is dropped first, e.g. "fifths".
    if input_str.endswith('s'):
        input_str = input_str[:-1]

    # "نیم" (half) is the fraction word _parse_sentence already handles.
    fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4, "نیم": 2}

    # NOTE(review): _SHORT_ORDINAL_FA / _LONG_ORDINAL_FA are not among the
    # names this module imports, so looking them up raised NameError on
    # every call.  Until the tables are provided, fall back to the words
    # listed above instead of failing.
    try:
        ordinal_names = _SHORT_ORDINAL_FA if short_scale else _LONG_ORDINAL_FA
    except NameError:
        ordinal_names = {}

    for num, name in ordinal_names.items():
        if num > 2:
            fracts[name] = num

    key = input_str.lower()
    if key in fracts:
        return 1.0 / fracts[key]
    return False
def extract_number_fa(text, ordinals=False, short_scale=True):
    """
    This function extracts the first number from a text string.

    Args:
        text (str): the string to extract a number from
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        short_scale (bool): use short scale if True, long scale if False.
            Accepted after `ordinals` so that existing positional calls
            keep their meaning; forwarded to extract_numbers_fa.
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
    numbers = extract_numbers_fa(text, short_scale=short_scale,
                                 ordinals=ordinals)
    return numbers[0] if numbers else False
def extract_duration_fr(text):
    """
    Convert a French phrase into a duration.

    Recognizes things like:
        "10 minutes"
        "3 jours 8 heures 10 minutes et 49 secondes"
    and sums every "<number> <unit>" pair into a single timedelta.

    The "<number> <unit>" pairs used in the duration are consumed and the
    remaining text is returned alongside the duration.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
        None is returned (instead of a tuple) when text is empty.
    """
    if not text:
        return None

    text = normalize_fr(text)

    # timedelta keyword -> French (plural) unit name
    unit_names = {
        'microseconds': 'microsecondes',
        'milliseconds': 'millisecondes',
        'seconds': 'secondes',
        'minutes': 'minutes',
        'hours': 'heures',
        'days': 'jours',
        'weeks': 'semaines'
    }
    totals = dict.fromkeys(unit_names, 0)

    # The amount is captured in the named group "value".  A named group needs
    # the "(?P<name>...)" form; "(?P\d+...)" is rejected by the re module.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[s]?(\s+|,|$)"

    for unit_en, unit_fr in unit_names.items():
        # drop the plural 's' - the pattern re-adds it as optional
        unit_pattern = pattern.format(unit=unit_fr[:-1])

        def repl(match, unit_en=unit_en):
            # accumulate the amount and remove the matched text
            totals[unit_en] += float(match.group('value'))
            return ''
        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**totals) if any(totals.values()) else None

    return (duration, text)
Args: words (array): the list to extract a number from i (int): the index in words where to look for the number Returns: tuple with number, index of next word after the number. Returns None if no number was found. """ def cte_fr(i, s): # Check if string s is equal to words[i]. # If it is return tuple with s, index of next word. # If it is not return None. if i < len(words) and s == words[i]: return s, i + 1 return None def number_word_fr(i, mi, ma): # Check if words[i] is a number in _NUMBERS_FR between mi and ma. # If it is return tuple with number, index of next word. # If it is not return None. if i < len(words): val = _NUMBERS_FR.get(words[i]) # Numbers [1-16,20,30,40,50,60,70,80,90,100,1000] if val is not None: if val >= mi and val <= ma: return val, i + 1 else: return None # The number may be hyphenated (numbers [17-999]) splitWord = words[i].split('-') if len(splitWord) > 1: val1 = _NUMBERS_FR.get(splitWord[0]) if val1: i1 = 0 val2 = 0 val3 = 0 if val1 < 10 and splitWord[1] == "cents": val1 = val1 * 100 i1 = 2 # For [81-99], e.g. "quatre-vingt-deux" if len(splitWord) > i1 and splitWord[0] == "quatre" and \ splitWord[1] == "vingt": val1 = 80 i1 += 2 # We still found a number if i1 == 0: i1 = 1 if len(splitWord) > i1: # For [21,31,41,51,61,71] if len(splitWord) > i1 + 1 and splitWord[i1] == "et": val2 = _NUMBERS_FR.get(splitWord[i1 + 1]) if val2 is not None: i1 += 2 # For [77-79],[97-99] e.g. "soixante-dix-sept" elif splitWord[i1] == "dix" and \ len(splitWord) > i1 + 1: val2 = _NUMBERS_FR.get(splitWord[i1 + 1]) if val2 is not None: val2 += 10 i1 += 2 else: val2 = _NUMBERS_FR.get(splitWord[i1]) if val2 is not None: i1 += 1 if len(splitWord) > i1: val3 = _NUMBERS_FR.get(splitWord[i1]) if val3 is not None: i1 += 1 if val2: if val3: val = val1 + val2 + val3 else: val = val1 + val2 else: return None if i1 == len(splitWord) and val and ma >= val >= mi: return val, i + 1 return None def number_1_99_fr(i): # Check if words[i] is a number between 1 and 99. 
# If it is return tuple with number, index of next word. # If it is not return None. # Is it a number between 1 and 16? result1 = number_word_fr(i, 1, 16) if result1: return result1 # Is it a number between 10 and 99? result1 = number_word_fr(i, 10, 99) if result1: val1, i1 = result1 result2 = cte_fr(i1, "et") # If the number is not hyphenated [21,31,41,51,61,71] if result2: i2 = result2[1] result3 = number_word_fr(i2, 1, 11) if result3: val3, i3 = result3 return val1 + val3, i3 return result1 # It is not a number return None def number_1_999_fr(i): # Check if words[i] is a number between 1 and 999. # If it is return tuple with number, index of next word. # If it is not return None. # Is it 100 ? result = number_word_fr(i, 100, 100) # Is it [200,300,400,500,600,700,800,900]? if not result: resultH1 = number_word_fr(i, 2, 9) if resultH1: valH1, iH1 = resultH1 resultH2 = number_word_fr(iH1, 100, 100) if resultH2: iH2 = resultH2[1] result = valH1 * 100, iH2 if result: val1, i1 = result result2 = number_1_99_fr(i1) if result2: val2, i2 = result2 return val1 + val2, i2 else: return result # Is it hyphenated? [101-999] result = number_word_fr(i, 101, 999) if result: return result # [1-99] result = number_1_99_fr(i) if result: return result return None def number_1_999999_fr(i): """ Find a number in a list of words Checks if words[i] is a number between 1 and 999,999. Args: i (int): the index in words where to look for the number Returns: tuple with number, index of next word after the number. Returns None if no number was found. 
""" # check for zero result1 = number_word_fr(i, 0, 0) if result1: return result1 # check for [1-999] result1 = number_1_999_fr(i) if result1: val1, i1 = result1 else: val1 = 1 i1 = i # check for 1000 result2 = number_word_fr(i1, 1000, 1000) if result2: # it's [1000-999000] i2 = result2[1] # check again for [1-999] result3 = number_1_999_fr(i2) if result3: val3, i3 = result3 return val1 * 1000 + val3, i3 else: return val1 * 1000, i2 elif result1: return result1 return None return number_1_999999_fr(i) def _get_ordinal_fr(word): """ Get the ordinal number Takes in a word (string without whitespace) and extracts the ordinal number. Args: word (string): the word to extract the number from Returns: number (int) Returns None if no ordinal number was found. """ if word: for ordinal in _ORDINAL_ENDINGS_FR: if word[0].isdigit() and ordinal in word: result = word.replace(ordinal, "") if result.isdigit(): return int(result) return None def _number_ordinal_fr(words, i): """ Find an ordinal number in a list of words Takes in a list of words (strings without whitespace) and extracts an ordinal number that starts at the given index. Args: words (array): the list to extract a number from i (int): the index in words where to look for the ordinal number Returns: tuple with ordinal number (str), index of next word after the number (int). Returns None if no ordinal number was found. 
def extract_number_fr(text, short_scale=True, ordinals=False):
    """Takes in a French string and extracts a number.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): unused, kept for API compatibility
        ordinals (bool): unused, kept for API compatibility
    Returns:
        (int) or (float) or False: The number extracted, or False when no
                                   (non-zero) number was found.
    """
    # TODO: short_scale and ordinals don't do anything here.
    # The parameters are present in the function signature for API
    # compatibility reasons.

    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = aWords[count + 1] if count < len(aWords) - 1 else ""
        wordPrev = aWords[count - 1] if count > 0 else ""

        if word in _ARTICLES_FR:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            # the next number found is added to the running result
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in _ARTICLES_FR and _get_ordinal_fr(word):
            val = _get_ordinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif is_fractional_fr(word):
            val = is_fractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = is_fractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        # Nothing matched above: consume the word ourselves.  Testing for
        # None (rather than falsiness) keeps a literal 0 from being
        # advanced past twice.
        if val is None:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            zeros = 0
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            for decimalWord in newWords:
                if decimalWord == "zéro" or decimalWord == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = None
            # extract the number after the zeros; there may be nothing left
            # after "virgule" (or only zeros), so check the bounds first
            if zeros < len(newWords) and newWords[zeros].isdigit():
                afterDotVal = newWords[zeros]
                countDot = count + zeros + 2
            # if a number was extracted (since comma is also a
            # punctuation sign)
            if afterDotVal:
                count = countDot
                if not val:
                    val = 0
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(val) + "." + afterDotString)

        if val:
            # an additive word before the very first number has nothing to
            # add to, so the value simply becomes the result
            if add and result is not None:
                result += val
            else:
                result = val
            add = False

    return result or False
= idx used = 0 # save timequalifier for later if word in timeQualifiersList: timeQualifier = word used = 1 if wordPrev in ["ce", "cet", "cette"]: used = 2 start -= 1 # parse aujourd'hui, demain, après-demain elif word == "aujourd'hui" and not fromFlag: dayOffset = 0 used += 1 elif word == "demain" and not fromFlag: dayOffset = 1 used += 1 elif word == "après-demain" and not fromFlag: dayOffset = 2 used += 1 # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine elif word in ["jour", "jours"]: if wordPrev.isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 # "3e jour" elif _get_ordinal_fr(wordPrev) is not None: dayOffset += _get_ordinal_fr(wordPrev) - 1 start -= 1 used = 2 elif word in ["semaine", "semaines"] and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordNext in ["prochaine", "suivante"]: dayOffset = 7 used = 2 elif wordNext in ["dernière", "précédente"]: dayOffset = -7 used = 2 # parse 10 mois, mois prochain, mois dernier elif word == "mois" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordNext in ["prochain", "suivant"]: monthOffset = 1 used = 2 elif wordNext in ["dernier", "précédent"]: monthOffset = -1 used = 2 # parse 5 ans, an prochain, année dernière elif word in ["an", "ans", "année", "années"] and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordNext in ["prochain", "prochaine", "suivant", "suivante"]: yearOffset = 1 used = 2 elif wordNext in ["dernier", "dernière", "précédent", "précédente"]: yearOffset = -1 used = 2 # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordNext in ["prochain", "suivant"]: dayOffset += 7 used += 1 elif wordNext in ["dernier", "précédent"]: dayOffset -= 7 used += 1 # parse 15 juillet, 15 juil elif word in months or word in monthsShort and not fromFlag: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months_en[m] if wordPrev and (wordPrev[0].isdigit()): datestr += " " + wordPrev start -= 1 used += 1 else: datestr += " 1" if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False # parse 5 jours après demain, 10 semaines après jeudi prochain, # 2 mois après juillet validFollowups = days + months + monthsShort validFollowups.append("aujourd'hui") validFollowups.append("demain") validFollowups.append("prochain") validFollowups.append("prochaine") validFollowups.append("suivant") validFollowups.append("suivante") validFollowups.append("dernier") validFollowups.append("dernière") validFollowups.append("précédent") validFollowups.append("précédente") validFollowups.append("maintenant") if word in ["après", "depuis"] and wordNext in validFollowups: used = 2 fromFlag = True if wordNext == "demain": dayOffset += 1 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if wordNextNext == "prochain": tmpOffset += 7 used += 1 elif wordNextNext == "dernier": tmpOffset -= 7 used += 1 elif tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 1] in ["ce", "cette"]: start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None ampm = "" isTime = False for idx, word in enumerate(words): if word == "": continue 
wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" used = 0 start = idx # parse midi et quart, minuit et demi, midi 10, minuit moins 20 if word in ["midi", "minuit"]: isTime = True if word == "midi": hrAbs = 12 used += 1 elif word == "minuit": hrAbs = 0 used += 1 if wordNext.isdigit(): minAbs = int(wordNext) used += 1 elif wordNext == "et": if wordNextNext == "quart": minAbs = 15 used += 2 elif wordNextNext == "demi": minAbs = 30 used += 2 elif wordNext == "moins": if wordNextNext.isdigit(): minAbs = 60 - int(wordNextNext) if not hrAbs: hrAbs = 23 else: hrAbs -= 1 used += 2 if wordNextNext == "quart": minAbs = 45 if not hrAbs: hrAbs = 23 else: hrAbs -= 1 used += 2 # parse une demi-heure, un quart d'heure elif word == "demi-heure" or word == "heure" and \ (wordPrevPrev in markers or wordPrevPrevPrev in markers): used = 1 isTime = True if word == "demi-heure": minOffset = 30 elif wordPrev == "quart": minOffset = 15 used += 1 start -= 1 elif wordPrev == "quarts" and wordPrevPrev.isdigit(): minOffset = int(wordPrevPrev) * 15 used += 1 start -= 1 if wordPrev.isdigit() or wordPrevPrev.isdigit(): start -= 1 used += 1 # parse 5:00 du matin, 12:00, etc elif word[0].isdigit() and _get_ordinal_fr(word) is None: isTime = True if ":" in word or "h" in word or "min" in word: # parse hours on short format # "3:00 du matin", "4h14", "3h15min" strHH = "" strMM = "" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] used = 1 elif word[i] in [":", "h", "m"]: stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] used = 1 else: stage = 2 if word[i:i + 3] == "min": i += 1 elif stage == 2: break if wordPrev in words_in: hrOffset = int(strHH) if strHH else 0 minOffset = int(strMM) if strMM else 0 else: hrAbs = int(strHH) if strHH else 0 
minAbs = int(strMM) if strMM else 0 else: # try to parse time without colons # 5 hours, 10 minutes etc. length = len(word) ampm = "" if ( word.isdigit() and wordNext in ["heures", "heure"] and word != "0" and ( int(word) < 100 or int(word) > 2400 )): # "dans 3 heures", "à 3 heures" if wordPrev in words_in: hrOffset = int(word) else: hrAbs = int(word) used = 2 idxHr = idx + 2 # "dans 1 heure 40", "à 1 heure 40" if idxHr < len(words): # "3 heures 45" if words[idxHr].isdigit(): if wordPrev in words_in: minOffset = int(words[idxHr]) else: minAbs = int(words[idxHr]) used += 1 idxHr += 1 # "3 heures et quart", "4 heures et demi" elif words[idxHr] == "et" and idxHr + 1 < len(words): if words[idxHr + 1] == "quart": if wordPrev in words_in: minOffset = 15 else: minAbs = 15 used += 2 idxHr += 2 elif words[idxHr + 1] == "demi": if wordPrev in words_in: minOffset = 30 else: minAbs = 30 used += 2 idxHr += 2 # "5 heures moins 20", "6 heures moins le quart" elif words[idxHr] == "moins" and \ idxHr + 1 < len(words): if words[idxHr + 1].isdigit(): if wordPrev in words_in: hrOffset -= 1 minOffset = 60 - int(words[idxHr + 1]) else: hrAbs = hrAbs - 1 minAbs = 60 - int(words[idxHr + 1]) used += 2 idxHr += 2 elif words[idxHr + 1] == "quart": if wordPrev in words_in: hrOffset -= 1 minOffset = 45 else: hrAbs = hrAbs - 1 minAbs = 45 used += 2 idxHr += 2 # remove word minutes if present if idxHr < len(words) and \ words[idxHr] in ["minutes", "minute"]: used += 1 idxHr += 1 elif wordNext == "minutes": # "dans 10 minutes" if wordPrev in words_in: minOffset = int(word) else: minAbs = int(word) used = 2 elif wordNext == "secondes": # "dans 5 secondes" secOffset = int(word) used = 2 elif int(word) > 100: # format militaire hrAbs = int(word) / 100 minAbs = int(word) - hrAbs * 100 used = 1 if wordNext == "heures": used += 1 # handle am/pm if timeQualifier: if timeQualifier == "matin": ampm = "am" elif timeQualifier == "après-midi": ampm = "pm" elif timeQualifier == "soir": ampm = "pm" elif 
timeQualifier == "nuit": if (hrAbs or 0) > 8: ampm = "pm" else: ampm = "am" hrAbs = ((hrAbs or 0) + 12 if ampm == "pm" and (hrAbs or 0) < 12 else hrAbs) hrAbs = ((hrAbs or 0) - 12 if ampm == "am" and (hrAbs or 0) >= 12 else hrAbs) if (hrAbs or 0) > 24 or ((minAbs or 0) > 59): isTime = False used = 0 elif wordPrev in words_in: isTime = False else: isTime = True elif not hrAbs and timeQualifier: if timeQualifier == "matin": hrAbs = 8 elif timeQualifier == "après-midi": hrAbs = 15 elif timeQualifier == "soir": hrAbs = 19 elif timeQualifier == "nuit": hrAbs = 2 isTime = True if used > 0: # removed parsed words from the sentence for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": if not hasYear: temp = datetime.strptime(datestr, "%B %d") if extractedDate.tzinfo: temp = temp.replace(tzinfo=gettz("UTC")) temp = temp.astimezone(extractedDate.tzinfo) temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: temp = datetime.strptime(datestr, "%B %d %Y") extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs is None and minAbs is 
def is_fractional_fr(input_str, short_scale=True):
    """
    Check whether the given text names a fraction.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted but not used by this function
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    token = input_str.lower()

    # Strip the plural "s" (e.g. "quarts"); "tiers" always ends in "s"
    if token != "tiers" and token.endswith('s'):
        token = token[:-1]

    # Position in this sequence (1-based) is the denominator
    named = ("entier", "demi", "tiers", "quart", "cinquième", "sixième",
             "septième", "huitième", "neuvième", "dixième", "onzième",
             "douzième", "treizième", "quatorzième", "quinzième", "seizième",
             "dix-septième", "dix-huitième", "dix-neuvième", "vingtième")
    denominators = {name: rank for rank, name in enumerate(named, start=1)}
    if token in denominators:
        return 1.0 / denominators[token]

    # Digit-based ordinals are handled by _get_ordinal_fr
    ordinal = _get_ordinal_fr(token)
    if ordinal:
        return 1.0 / ordinal

    round_ordinals = {"trentième": 30, "centième": 100, "millième": 1000}
    if token in round_ordinals:
        return 1.0 / round_ordinals[token]

    return False
def normalize_hu(text, remove_articles=True):
    """ Hungarian string normalization

    Delegates to HungarianNormalizer.normalize.

    Args:
        text (str): the string to normalize
        remove_articles (bool): forwarded to the normalizer
    Returns:
        the value returned by HungarianNormalizer.normalize
    """
    normalizer = HungarianNormalizer()
    return normalizer.normalize(text, remove_articles)
# See the License for the specific language governing permissions and # limitations under the License. # """ Parse functions for Italian (IT-IT) """ import collections from datetime import datetime from dateutil.relativedelta import relativedelta from lingua_franca.time import now_local from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ extract_numbers_generic, Normalizer from lingua_franca.lang.format_it import _LONG_SCALE_IT, _SHORT_SCALE_IT, \ pronounce_number_it from lingua_franca.lang.common_data_it import _SHORT_ORDINAL_STRING_IT, \ _ARTICLES_IT, _LONG_ORDINAL_STRING_IT, _STRING_NUM_IT def is_fractional_it(input_str, short_scale=False): """ This function takes the given text and checks if it is a fraction. Updated to italian from en version 18.8.9 Args: input_str (str): the string to check if fractional short_scale (bool): use short scale if True, long scale if False Returns: (bool) or (float): False if not a fraction, otherwise the fraction """ input_str = input_str.lower() if input_str.endswith('i', -1) and len(input_str) > 2: input_str = input_str[:-1] + "o" # normalizza plurali fracts_it = {"intero": 1, "mezza": 2, "mezzo": 2} if short_scale: for num in _SHORT_ORDINAL_STRING_IT: if num > 2: fracts_it[_SHORT_ORDINAL_STRING_IT[num]] = num else: for num in _LONG_ORDINAL_STRING_IT: if num > 2: fracts_it[_LONG_ORDINAL_STRING_IT[num]] = num if input_str in fracts_it: return 1.0 / fracts_it[input_str] return False def _extract_number_long_it(word): """ This function converts a long textual number like milleventisette -> 1027 diecimila -> 10041 in integer value, covers from 0 to 999999999999999 for now limited to 999_e21 but ready for 999_e63 example: milleventisette -> 1027 diecimilaquarantuno-> 10041 centottomiladuecentotredici -> 108213 Args: word (str): the word to convert in number Returns: (bool) or (int): The extracted number or False if no number was found """ units = {'zero': 0, 'uno': 1, 'due': 2, 'tre': 3, 'quattro': 4, 
'cinque': 5, 'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9} tens = {'dieci': 10, 'venti': 20, 'trenta': 30, 'quaranta': 40, 'cinquanta': 50, 'sessanta': 60, 'settanta': 70, 'ottanta': 80, 'novanta': 90} tens_short = {'vent': 20, 'trent': 30, 'quarant': 40, 'cinquant': 50, 'sessant': 60, 'settant': 70, 'ottant': 80, 'novant': 90} nums_long = {'undici': 11, 'dodici': 12, 'tredici': 13, 'quattordici': 14, 'quindici': 15, 'sedici': 16, 'diciassette': 17, 'diciotto': 18, 'diciannove': 19} multipli_it = collections.OrderedDict([ # (1e63, 'deciliardi'), # (1e60, 'decilioni'), # (1e57, 'noviliardi'), # (1e54, 'novilioni'), # (1e51, 'ottiliardi'), # (1e48, 'ottilioni'), # (1e45, 'settiliardi'), # (1e42, 'settilioni'), # (1e39, 'sestiliardi'), # (1e36, 'sestilioni'), # (1e33, 'quintiliardi'), # (1e30, 'quintilioni'), # (1e27, 'quadriliardi'), # (1e24, 'quadrilioni'), # yotta (1e21, 'triliardi'), # zetta (1e18, 'trilioni'), # exa (1e15, 'biliardi'), # peta (1e12, 'bilioni'), # tera (1e9, 'miliardi'), # giga (1e6, 'milioni') # mega ]) multiplier = {} un_multiplier = {} for num in multipli_it: if num > 1000 and num <= 1e21: # plurali multiplier[multipli_it[num]] = int(num) # singolari - modificare per eccezioni *liardo if multipli_it[num][-5:-1] == 'iard': un_multiplier['un' + multipli_it[num][:-1] + 'o'] = int(num) else: un_multiplier['un' + multipli_it[num][:-1] + 'e'] = int(num) value = False # normalizza ordinali singoli o plurali -esimo -esimi if word[-5:-1] == 'esim': base = word[:-5] normalize_ita3 = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'} normalize_ita2 = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e', 'ov': 'e'} if base[-3:] in normalize_ita3: base += normalize_ita3[base[-3:]] elif base[-2:] in normalize_ita2: base += normalize_ita2[base[-2:]] word = base for item in un_multiplier: components = word.split(item, 1) if len(components) == 2: if not components[0]: # inizia con un1^x if not components[1]: # unmilione word = str(int(un_multiplier[item])) else: # unmilione + 
x word = str(int(un_multiplier[item]) + _extract_number_long_it(components[1])) for item in multiplier: components = word.split(item, 1) if len(components) == 2: if not components[0]: # inizia con un1^x word = str(int(multiplier[item]) + _extract_number_long_it(components[1])) else: if not components[1]: word = str(_extract_number_long_it(components[0])) + '*' \ + str(int(multiplier[item])) else: word = str(_extract_number_long_it(components[0])) + '*' \ + str(int(multiplier[item])) + '+' \ + str(_extract_number_long_it(components[1])) for item in tens: word = word.replace(item, '+' + str(tens[item])) for item in tens_short: word = word.replace(item, '+' + str(tens_short[item])) for item in nums_long: word = word.replace(item, '+' + str(nums_long[item])) word = word.replace('cento', '+1xx') word = word.replace('cent', '+1xx') word = word.replace('mille', '+1000') # unmilionemille word = word.replace('mila', '*1000') # unmilioneduemila for item in units: word = word.replace(item, '+' + str(units[item])) # normalizzo i cento occorrenze = word.count('+1xx') for _ in range(0, occorrenze): components = word.rsplit('+1xx', 1) if len(components[0]) > 1 and components[0].endswith('0'): word = components[0] + '+100' + components[1] else: word = components[0] + '*100' + components[1] components = word.rsplit('*1000', 1) if len(components) == 2: if components[0].startswith('*'): # centomila components[0] = components[0][1:] word = str(_extract_number_long_it(components[0])) + \ '*1000' + str(components[1]) # gestione eccezioni if word.startswith('*') or word.startswith('+'): word = word[1:] addends = word.split('+') for c, _ in enumerate(addends): if '*' in addends[c]: factors = addends[c].split('*') result = int(factors[0]) * int(factors[1]) if len(factors) == 3: result *= int(factors[2]) addends[c] = str(result) # check if all token are numbers if all([s.isdecimal() for s in addends]): value = sum([int(s) for s in addends]) else: value = False return value def 
def extract_number_it(text, short_scale=False, ordinals=False):
    """Extract a number from an Italian text string.

    Handles pronunciations in long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found
    """

    def leading_zeros(fragment):
        """Count the 'zero' / '0' words that open *fragment*."""
        count = 0
        for element in fragment.split(' '):
            if element == 'zero' or element == '0':
                count += 1
            else:
                break
        return count

    text = text.lower()

    # NOTE: ordinal and scale names are written into the module-level
    # _STRING_NUM_IT table, so they stay registered after this call returns
    # (including for the recursive calls below, which use default flags).
    if ordinals:
        ordinal_names = _SHORT_ORDINAL_STRING_IT if short_scale \
            else _LONG_ORDINAL_STRING_IT
        for num in ordinal_names:
            _STRING_NUM_IT[ordinal_names[num]] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not usual in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    # split sentence, parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    scale_names = _SHORT_SCALE_IT if short_scale else _LONG_SCALE_IT
    for num in scale_names:
        num_string = scale_names[num]
        _STRING_NUM_IT[num_string] = num
        multiplies.append(num_string)

    # '2 e 3/4' and similar cases
    for separator in fraction_marker:
        components = text.split(separator)
        if len(components) == 2:
            zeros = leading_zeros(components[1])

            # ensure first is not a fraction and second is a fraction
            num1 = extract_number_it(components[0])
            num2 = extract_number_it(components[1])
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2
            # 'sette e quaranta' (7.40), 'sette e zero zero due' (7.002)
            elif num1 is not None and num2 is not None \
                    and num1 >= 1 and num2 > 1:
                # A numeric token comes back as a float ('40' -> 40.0);
                # count its digits on the integer form, otherwise the
                # trailing '.0' inflates the exponent ('7 e 40' -> 7.004).
                if isinstance(num2, float) and num2.is_integer():
                    num2 = int(num2)
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # '2 punto 5'
    for separator in decimal_marker:
        components = text.split(separator)
        if len(components) == 2:
            zeros = leading_zeros(components[1])
            # int(False) == 0: a side without a number counts as zero
            number = int(extract_number_it(components[0]))
            decimal = int(extract_number_it(components[1]))
            return number + decimal / pow(10, len(str(decimal)) + zeros)

    all_words = text.split()
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):
        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in _STRING_NUM_IT:
            val = _STRING_NUM_IT[word]

        # 'tre quarti', 'un quarto', 'trenta secondi'
        if is_fractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # 'mezza tazza'
        if val is False:
            val = is_fractional_it(word, short_scale=short_scale)

        # '2 quinti'
        if not ordinals:
            next_value = is_fractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = _extract_number_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif _extract_number_long_it(word) > 100 and \
                    _extract_number_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    for addend in to_sum:
        val = val + addend
    return val
def normalize_it(text, remove_articles=True):
    """Normalize an Italian string.

    Replaces 'un paio' with 'due', optionally drops articles, collapses
    repeated whitespace and turns number words into digits
    (e.g. 'quarantadue' -> '42').

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop the words listed in _ARTICLES_IT
    Returns:
        (str): the normalized string
    """
    # replace ambiguous words
    text = text.replace('un paio', 'due')

    normalized = []
    for word in text.split():  # split() also removes extra spaces
        # Italian needs the article to define the grammatical gender, so
        # removal is optional.
        if remove_articles and word in _ARTICLES_IT:
            continue

        if word in _STRING_NUM_IT:
            word = str(_STRING_NUM_IT[word])

        val = extract_number_it(word)
        # Only whole values are rewritten as digits; a token such as '2.5'
        # is kept as it is instead of being truncated to '2'.
        if val and float(val).is_integer():
            word = str(int(val))

        normalized.append(word)

    return ' '.join(normalized)
def extract_datetime_it(text, anchorDate=None, default_time=None):
    """Extract a date and/or time from an Italian sentence.

    Args:
        text (str): the sentence to parse
        anchorDate (datetime): reference point for relative expressions;
            now_local() is used when omitted
        default_time (time): time of day applied when the sentence contains
            no time information
    Returns:
        [datetime, str] or None: the extracted datetime and the words left
            over after the date/time words were removed; None for an empty
            string or when nothing date or time related was recognised
    """

    def clean_string(s):
        """Strip punctuation, lower-case, normalize Italian plurals and
        return the remaining words as a list."""
        symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\'']

        for word in symbols:
            s = s.replace(word, '')

        s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'") \
            .replace('é', "e'").replace('ì', 'i').replace('ù', 'u') \
            .replace('ò', 'o').replace('-', ' ').replace('_', '')

        # normalize plurals to simplify the analysis
        s = s.replace('secondi', 'secondo').replace('minuti', 'minuto') \
            .replace('ore', 'ora').replace('giorni', 'giorno') \
            .replace('settimane', 'settimana').replace('mesi', 'mese') \
            .replace('anni', 'anno').replace('mattino', 'mattina') \
            .replace('prossima', 'prossimo').replace('questa', 'questo') \
            .replace('quarti', 'quarto').replace('in punto', 'in_punto') \
            .replace('decennio', 'decenni').replace('secoli', 'secolo') \
            .replace('millennio', 'millenni').replace(' un ', ' uno ') \
            .replace('scorsa', 'scorso').replace('passata', 'passato') \
            .replace('uno paio', 'due')

        noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo',
                       'le', 'alle', 'alla', 'dai', 'delle', 'della',
                       'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel',
                       'nello', 'dallo', ' ']

        word_list = s.split()
        word_list = [x for x in word_list if x not in noise_words]

        # merge '<hour> <minute>' pairs into 'hour:minute'
        for idx in range(len(word_list) - 1):
            hours = word_list[idx]
            minutes = word_list[idx + 1]
            # isdecimal() is False for '' (a token blanked by the previous
            # merge) and for tokens like '5:30', which int() cannot parse.
            if hours.isdecimal() and minutes.isdecimal():
                num0 = int(hours)
                num1 = int(minutes)
                if 0 <= num0 <= 23 and 10 <= num1 <= 59:
                    word_list[idx] = str(num0) + ':' + str(num1)
                    word_list[idx + 1] = ''

        word_list = [x for x in word_list if x]

        return word_list

    def date_found():
        return found or \
            (datestr != '' or time_str != '' or year_offset != 0 or
             month_offset != 0 or day_offset is True or hr_offset != 0 or
             hr_abs or min_offset != 0 or min_abs or sec_offset != 0 or
             time_qualifier != '')

    if text == '':
        return None
    anchorDate = anchorDate or now_local()
    found = False
    day_specified = False
    day_offset = False
    month_offset = 0
    year_offset = 0
    today = anchorDate.strftime('%w')
    current_year = anchorDate.strftime('%Y')
    from_flag = False
    datestr = ''
    has_year = False
    time_qualifier = ''
    time_qualifiers_am = ['mattina', 'stamani', 'stamane']
    time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte']
    time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm)
    markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro']
    days = ['lunedi', 'martedi', 'mercoledi',
            'giovedi', 'venerdi', 'sabato', 'domenica']
    months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre']
    months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
                    'set', 'ott', 'nov', 'dic']
    year_multiples = ['decenni', 'secolo', 'millenni']  # decennio <- decenni
    time_multiples = ['ora', 'minuto', 'secondo']
    day_multiples = ['settimana', 'mese', 'anno']
    noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del',
                     'l', 'in_punto', ' ', 'nella', 'dell']
    # words that may follow 'da' / 'dopo' ("5 days from tomorrow", ...)
    valid_followups = days + months + months_short + \
        ['oggi', 'domani', 'prossimo', 'passato', 'adesso']

    words = clean_string(text)

    for idx, word in enumerate(words):
        if word == '':
            continue
        word_prev = words[idx - 1] if idx > 0 else ''
        word_next = words[idx + 1] if idx + 1 < len(words) else ''
        word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
        start = idx
        used = 0
        if word == 'adesso' and not datestr:
            # 'ora' cannot be used here: it clashes with 'tra un ora'
            words = [x for x in words if x != 'adesso']
            words = [x for x in words if x]
            result_str = ' '.join(words)
            extracted_date = anchorDate.replace(microsecond=0)
            return [extracted_date, result_str]

        # 'un paio di ...', 'tra tre settimane', ... up to centuries
        elif extract_number_it(word) and (word_next in year_multiples or
                                          word_next in day_multiples):
            multiplier = int(extract_number_it(word))
            used += 2
            if word_next == 'decenni':
                year_offset = multiplier * 10
            elif word_next == 'secolo':
                year_offset = multiplier * 100
            elif word_next == 'millenni':
                year_offset = multiplier * 1000
            elif word_next == 'anno':
                year_offset = multiplier
            elif word_next == 'mese':
                month_offset = multiplier
            elif word_next == 'settimana':
                day_offset = multiplier * 7
        # save the time qualifier for later
        elif word in time_qualifiers_list:
            time_qualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == 'oggi' and not from_flag:
            day_offset = 0
            used += 1
        elif word == 'domani' and not from_flag:
            day_offset = 1
            used += 1
        elif word == 'ieri' and not from_flag:
            day_offset -= 1
            used += 1
        elif word == 'dopodomani' and not from_flag:  # after tomorrow
            day_offset += 2
            used += 1
        elif word == 'dopo' and word_next == 'domani' and not from_flag:
            # counts one day here; the 'da'/'dopo' follow-up handling below
            # adds the second one
            day_offset += 1
            used += 2
        elif word == 'giorno':
            # only a plain integer before 'giorno' is a day count; this
            # also covers 'giorno' being the first remaining word
            if word_prev.isdecimal():
                day_offset += int(word_prev)
                start -= 1
                used = 2
                if word_next == 'dopo' and word_next_next == 'domani':
                    day_offset += 1
                    used += 2
        elif word == 'settimana' and not from_flag:
            if word_prev == 'prossimo':
                day_offset = 7
                start -= 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                day_offset = -7
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                day_offset = 7
                used += 2
            elif word_next == 'passato' or word_next == 'scorso':
                day_offset = -7
                used += 2
        # parse next month, last month
        elif word == 'mese' and not from_flag:
            if word_prev == 'prossimo':
                month_offset = 1
                start -= 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                month_offset = -1
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                month_offset = 1
                used += 2
            elif word_next == 'passato' or word_next == 'scorso':
                month_offset = -1
                used += 2
        # parse next year, last year
        elif word == 'anno' and not from_flag:
            if word_prev == 'prossimo':  # prossimo anno
                year_offset = 1
                start -= 1
                used = 2
            elif word_next == 'prossimo':  # anno prossimo
                year_offset = 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                year_offset = -1
                start -= 1
                used = 2
            elif word_next == 'passato' or word_next == 'scorso':
                year_offset = -1
                used = 2
        # parse next decade, last decade
        elif word == 'decenni' and not from_flag:
            if word_prev == 'prossimo':
                year_offset = 10
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                year_offset = 10
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                year_offset = -10
                start -= 1
                used = 2
            elif word_next == 'passato' or word_next == 'scorso':
                year_offset = -10
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not from_flag:
            ddd = days.index(word)
            day_offset = (ddd + 1) - int(today)
            used = 1
            if day_offset < 0:
                day_offset += 7
            if word_prev == 'prossimo':
                day_offset += 7
                start -= 1
                used += 1
            elif word_prev == 'passato' or word_prev == 'scorso':
                day_offset -= 7
                start -= 1
                used += 1
            if word_next == 'prossimo':
                day_offset += 7
                used += 1
            elif word_next == 'passato' or word_next == 'scorso':
                day_offset -= 7
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # (parenthesised so that from_flag guards full month names too)
        elif (word in months or word in months_short) and not from_flag:
            try:
                mmm = months.index(word)
            except ValueError:
                mmm = months_short.index(word)
            used += 1
            datestr = months[mmm]
            if word_prev and extract_number_it(word_prev):
                datestr += ' ' + str(int(extract_number_it(word_prev)))
                start -= 1
                used += 1
                if word_next and extract_number_it(word_next):
                    datestr += ' ' + str(int(extract_number_it(word_next)))
                    used += 1
                    has_year = True
                else:
                    has_year = False
            elif word_next and word_next[0].isdigit():
                datestr += ' ' + word_next
                used += 1
                if word_next_next and word_next_next[0].isdigit():
                    datestr += ' ' + word_next_next
                    used += 1
                    has_year = True
                else:
                    has_year = False

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == 'da' or word == 'dopo') and word_next in valid_followups:
            used = 0
            from_flag = True
            if word_next == 'domani':
                day_offset += 1
                used += 2
            elif word_next == 'oggi' or word_next == 'adesso':
                used += 2
            elif word_next in days:
                ddd = days.index(word_next)
                tmp_offset = (ddd + 1) - int(today)
                used += 2
                if tmp_offset < 0:
                    tmp_offset += 7
                if word_next_next == 'prossimo':
                    tmp_offset += 7
                    used += 1
                elif word_next_next == 'passato' or \
                        word_next_next == 'scorso':
                    tmp_offset = (ddd + 1) - int(today)
                    used += 1
                day_offset += tmp_offset
            elif word_next_next and word_next_next in days:
                ddd = days.index(word_next_next)
                tmp_offset = (ddd + 1) - int(today)
                if word_next == 'prossimo':
                    tmp_offset += 7
                day_offset += tmp_offset
                used += 3

        if used > 0:
            if start - 1 > 0 and words[start - 1] == 'questo':
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ''

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ''
            found = True
            day_specified = True

    # parse time
    time_str = ''
    hr_offset = 0
    min_offset = 0
    sec_offset = 0
    hr_abs = None
    min_abs = None
    military = False

    for idx, word in enumerate(words):
        if word == '':
            continue
        word_prev_prev = words[idx - 2] if idx > 1 else ''
        word_prev = words[idx - 1] if idx > 0 else ''
        word_next = words[idx + 1] if idx + 1 < len(words) else ''
        word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == 'mezzogiorno':
            hr_abs = 12
            used += 1
        elif word == 'mezzanotte':
            hr_abs = 24
            used += 1
        if word == 'mezzo' and word_next == 'giorno':
            hr_abs = 12
            used += 2
        elif word == 'mezza' and word_next == 'notte':
            hr_abs = 24
            used += 2
        elif word == 'mattina':
            if not hr_abs:
                hr_abs = 8
            used += 1
            if word_next and word_next[0].isdigit():  # mattina alle 5
                hr_abs = int(word_next)
                used += 1
        elif word == 'pomeriggio':
            if not hr_abs:
                hr_abs = 15
            used += 1
            if word_next and word_next[0].isdigit():  # pomeriggio alle 5
                hr_abs = int(word_next)
                used += 1
                if hr_abs < 12:
                    hr_abs = hr_abs + 12
        elif word == 'sera':
            if not hr_abs:
                hr_abs = 19
            used += 1
            if word_next and word_next[0].isdigit() \
                    and ':' not in word_next:
                hr_abs = int(word_next)
                used += 1
                if hr_abs < 12:
                    hr_abs = hr_abs + 12
        # 'presto' / 'tardi' shift an hour that is already known; without
        # one there is nothing to shift, so the word is left alone
        elif word == 'presto' and hr_abs is not None:
            hr_abs -= 1
            used += 1
        elif word == 'tardi' and hr_abs is not None:
            hr_abs += 1
            used += 1
        # 'un paio di minuti', 'tra cinque minuti', 'tra 5 ore'
        elif extract_number_it(word) and (word_next in time_multiples):
            d_time = int(extract_number_it(word))
            used += 2
            if word_next == 'ora':
                hr_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
            elif word_next == 'minuto':
                min_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
            elif word_next == 'secondo':
                sec_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
        elif word == 'mezzora':
            min_offset = 30
            used = 1
            isTime = False
            hr_abs = -1
            min_abs = -1
        elif extract_number_it(word) and word_next and \
                word_next == 'quarto' and word_next_next == 'ora':
            if int(extract_number_it(word)) == 1 \
                    or int(extract_number_it(word)) == 3:
                min_offset = 15 * int(extract_number_it(word))
            else:  # any other count is treated as a single quarter
                min_offset = 15
            used = 3
            isTime = False
            hr_abs = -1
            min_abs = -1
        elif word[0].isdigit():
            isTime = True
            str_hh = ''
            str_mm = ''
            remainder = ''
            # plain integer value of the token, None for '5pm', '2.5', ...
            number = int(word) if word.isdecimal() else None
            if ':' in word:
                # parse colons
                # '3:00 in the morning'
                components = word.split(':')
                if len(components) == 2:
                    num0 = int(extract_number_it(components[0]))
                    num1 = int(extract_number_it(components[1]))
                    if 0 <= num0 <= 23 and 0 <= num1 <= 59:
                        str_hh = str(num0)
                        str_mm = str(num1)
            elif number is not None and 0 < number < 24 \
                    and word_next != 'quarto':
                str_hh = str(number)
                str_mm = '00'
            elif number is not None and 100 <= number <= 2400 \
                    and number % 100 <= 59:
                # military time: integer split so the minutes survive
                str_hh = number // 100
                str_mm = number % 100
                military = True
                isTime = False
            if extract_number_it(word) and word_next \
                    and word_next == 'quarto' and word_next_next != 'ora':
                if int(extract_number_it(word)) == 1 \
                        or int(extract_number_it(word)) == 3:
                    str_mm = str(15 * int(extract_number_it(word)))
                else:  # any other count is treated as no minutes
                    str_mm = '0'
                str_hh = str(hr_abs)
                used = 2
                words[idx + 1] = ''
                isTime = False
            if extract_number_it(word) and word_next \
                    and word_next == 'in_punto':
                str_hh = str(int(extract_number_it(word)))
                used = 2

            try:
                int(str_hh)
            except ValueError:
                # The token starts with a digit but is not a clock time
                # ('30', '3000', '25:00', '5pm', ...): leave it in place.
                continue

            if word_next == 'pm':
                remainder = 'pm'
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                if hr_abs <= 12:
                    hr_abs = hr_abs + 12
                used = 2
            elif word_next == 'am':
                remainder = 'am'
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                used = 2
            elif word_next == 'mattina':
                # ' 11 del mattina'
                hh = int(str_hh)
                mm = int(str_mm)
                used = 2
                remainder = 'am'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'pomeriggio':
                # ' 2 del pomeriggio'
                hh = int(str_hh)
                mm = int(str_mm)
                if hh < 12:
                    hh += 12
                used = 2
                remainder = 'pm'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'sera':
                # 'alle 8 di sera'
                hh = int(str_hh)
                mm = int(str_mm)
                if hh < 12:
                    hh += 12
                used = 2
                remainder = 'pm'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'notte':
                hh = int(str_hh)
                mm = int(str_mm)
                if hh > 5:
                    remainder = 'pm'
                else:
                    remainder = 'am'
                used = 2
                isTime = False
                hr_abs = hh
                min_abs = mm
            # parse half an hour : undici e mezza
            elif word_next and word_next == 'mezza':
                hr_abs = int(str_hh)
                min_abs = 30
                used = 2
                isTime = False
            elif word_next and word_next == 'in_punto':
                hr_abs = int(str_hh)
                min_abs = 0
                str_mm = '0'
                used = 2
                isTime = False
            else:
                # 17:30
                remainder = ''
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                used = 1
                isTime = False
                if word_prev == 'ora':
                    words[idx - 1] = ''

            if time_qualifier != '':
                if str_hh and int(str_hh) <= 12 and \
                        (time_qualifier in time_qualifiers_pm):
                    str_hh = str(int(str_hh) + 12)
            else:
                isTime = False

            str_hh = int(str_hh) if str_hh else 0
            str_mm = int(str_mm) if str_mm else 0

            str_hh = str_hh + 12 if remainder == 'pm' \
                and str_hh < 12 else str_hh
            str_hh = str_hh - 12 if remainder == 'am' \
                and str_hh >= 12 else str_hh

            if (not military and
                    remainder not in ['am', 'pm'] and
                    ((not day_specified) or day_offset < 1)):
                # ambiguous time, detect whether they mean this evening or
                # the next morning based on whether it has already passed
                hr_abs = str_hh
                if anchorDate.hour < str_hh:
                    pass  # No modification needed
                elif anchorDate.hour < str_hh + 12:
                    str_hh += 12
                    hr_abs = str_hh
                else:
                    # has passed, assume the next morning
                    day_offset += 1

            if time_qualifier in time_qualifiers_pm and str_hh < 12:
                str_hh += 12

            if str_hh > 24 or str_mm > 59:
                isTime = False
                used = 0
            if isTime:
                hr_abs = str_hh * 1
                min_abs = str_mm * 1
                used += 1

            if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or
                                        time_qualifier == 'pomeriggio'):
                hr_abs = (hr_abs or 0) + 12

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                words[idx + i] = ''

            if word_prev == 'o' or word_prev == 'oh':
                words[words.index(word_prev)] = ''

            if idx > 0 and word_prev in markers:
                words[idx - 1] = ''
            if idx > 1 and word_prev_prev in markers:
                words[idx - 2] = ''

            found = True

    # check that we found a date (date_found must be *called*: the bare
    # function object is always truthy)
    if not date_found():
        return None

    if day_offset is False:
        day_offset = 0

    # perform date manipulation

    extracted_date = anchorDate.replace(microsecond=0)

    if datestr != '':
        # strptime needs English month names
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                           'aug', 'sept', 'oct', 'nov', 'dec']

        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)

        for idx, en_month in enumerate(en_months_short):
            datestr = datestr.replace(months_short[idx], en_month)

        try:
            temp = datetime.strptime(datestr, '%B %d')
        except ValueError:
            # Try again, allowing the year
            temp = datetime.strptime(datestr, '%B %d %Y')
        extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
        if not has_year:
            temp = temp.replace(year=extracted_date.year,
                                tzinfo=extracted_date.tzinfo)
            if extracted_date < temp:
                extracted_date = extracted_date.replace(
                    year=int(current_year),
                    month=int(temp.strftime('%m')),
                    day=int(temp.strftime('%d')),
                    tzinfo=extracted_date.tzinfo)
            else:
                extracted_date = extracted_date.replace(
                    year=int(current_year) + 1,
                    month=int(temp.strftime('%m')),
                    day=int(temp.strftime('%d')),
                    tzinfo=extracted_date.tzinfo)
        else:
            extracted_date = extracted_date.replace(
                year=int(temp.strftime('%Y')),
                month=int(temp.strftime('%m')),
                day=int(temp.strftime('%d')),
                tzinfo=extracted_date.tzinfo)
    else:
        # ignore the current HH:MM:SS if relative using days or greater
        if hr_offset == 0 and min_offset == 0 and sec_offset == 0:
            extracted_date = extracted_date.replace(hour=0, minute=0,
                                                    second=0)

    if year_offset != 0:
        extracted_date = extracted_date + relativedelta(years=year_offset)
    if month_offset != 0:
        extracted_date = extracted_date + relativedelta(months=month_offset)
    if day_offset != 0:
        extracted_date = extracted_date + relativedelta(days=day_offset)
    # -1 marks "relative offset only, no absolute time of day"
    if hr_abs != -1 and min_abs != -1:
        # If no time was supplied in the string set the time to default
        # time if it's available
        if hr_abs is None and min_abs is None and default_time is not None:
            hr_abs, min_abs = default_time.hour, default_time.minute
        else:
            hr_abs = hr_abs or 0
            min_abs = min_abs or 0

        extracted_date = extracted_date + relativedelta(hours=hr_abs,
                                                        minutes=min_abs)
        if (hr_abs != 0 or min_abs != 0) and datestr == '':
            if not day_specified and anchorDate > extracted_date:
                extracted_date = extracted_date + relativedelta(days=1)
    if hr_offset != 0:
        extracted_date = extracted_date + relativedelta(hours=hr_offset)
    if min_offset != 0:
        extracted_date = extracted_date + relativedelta(minutes=min_offset)
    if sec_offset != 0:
        extracted_date = extracted_date + relativedelta(seconds=sec_offset)

    words = [x for x in words if x not in noise_words_2]
    words = [x for x in words if x]
    result_str = ' '.join(words)

    return [extracted_date, result_str]
def get_gender_it(word, context=""):
    """Guess the grammatical gender of an Italian word.

    In Italian the gender of a word is defined by the article that precedes
    it and not only by its last letter. When *word* occurs in *context*
    after another word, the gender of that preceding word is used;
    otherwise the last letter of *word* decides.

    Args:
        word (str): the word to classify
        context (str): optional sentence containing the word
    Returns:
        'f', 'm' or None when the gender cannot be determined (including
        for an empty word)
    """
    gender = None
    words = context.split(' ')
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            # the preceding word may be '' when the context has doubled
            # spaces; that yields None and the ending rule below applies
            gender = get_gender_it(words[idx - 1])
            break

    if not gender and word:
        ending = word[-1]
        if ending in ('a', 'e'):
            gender = 'f'
        if ending in ('o', 'n', 'l', 'i'):
            gender = 'm'

    return gender
def _convert_words_to_numbers_nl(text, short_scale=True, ordinals=False):
    """Replace the number words in a string with their numeric values.

    Args:
        text str:
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third) should
                          be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The original text, with numbers subbed in where appropriate.
    """
    tokens = tokenize(text.lower())
    pending = _extract_numbers_with_text_nl(tokens, short_scale, ordinals)
    pending.sort(key=lambda number: number.start_index)

    pieces = []
    cursor = 0  # index of the next number that has not been emitted yet
    for token in tokens:
        current = pending[cursor] if cursor < len(pending) else None
        if current is None or token.index < current.start_index:
            # plain word outside of any number span
            pieces.append(token.word)
            continue
        if token.index == current.start_index:
            # first token of the span: emit the value once
            pieces.append(str(current.value))
        if token.index == current.end_index:
            # last token of the span: move on to the next number
            cursor += 1

    return ' '.join(pieces)
def _extract_number_with_text_nl(tokens, short_scale=True,
                                 ordinals=False, fractional_numbers=True):
    """Extract a single number from a list of tokens.

    Args:
        tokens [Token]: the tokens to parse
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): True if we should look for fractions and
                                   decimals.

    Returns:
        ReplaceableNumber built from the parsed value and its tokens, with
        leading articles trimmed from the token list.
    """
    number, number_tokens = _extract_number_with_text_nl_helper(
        tokens, short_scale, ordinals, fractional_numbers)

    if number_tokens:
        # drop the articles in front of the number, in place
        leading = 0
        while leading < len(number_tokens) \
                and number_tokens[leading].word in _ARTICLES_NL:
            leading += 1
        del number_tokens[:leading]

    return ReplaceableNumber(number, number_tokens)
def _extract_fraction_with_text_nl(tokens, short_scale, ordinals):
    """Extract a 'whole number + fraction' value from a token list.

    Handles text such as '2 and 3/4'. Phrases like "one half" are left to
    the whole number parser.

    Args:
        tokens [Token]: words and their indexes in the original string.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (int or float, [Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.
    """
    for marker in _FRACTION_MARKER_NL:
        partitions = partition_list(tokens, lambda t: t.word == marker)
        if len(partitions) != 3:
            continue

        before, marker_tokens, after = partitions
        whole_candidates = _extract_numbers_with_text_nl(
            before, short_scale, ordinals, fractional_numbers=False)
        fraction_candidates = _extract_numbers_with_text_nl(
            after, short_scale, ordinals, fractional_numbers=True)

        if not (whole_candidates and fraction_candidates):
            return None, None

        # the number right before the marker must be whole, the one right
        # after it must be a proper fraction
        whole = whole_candidates[-1]
        fraction = fraction_candidates[0]
        if whole.value >= 1 and 0 < fraction.value < 1:
            total = whole.value + fraction.value
            return total, whole.tokens + marker_tokens + fraction.tokens

    return None, None
def _extract_whole_number_with_text_nl(tokens, short_scale, ordinals):
    """Handle numbers not handled by the decimal or fraction functions.

    This is generally whole numbers. Note that phrases such as "one half"
    will be handled by this function, while "one and a half" are handled by
    the fraction function.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:

    Returns:
        int or float, [Token]
        The value parsed, and tokens that it corresponds to.
    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data_nl(short_scale)

    number_words = []  # type: [Token]
    val = False
    prev_val = None
    next_val = None
    to_sum = []
    for idx, token in enumerate(tokens):
        # value contributed by this token alone (None if it sets none)
        current_val = None
        if next_val:
            # this token was already consumed as the fraction that follows
            # the previous token ("2 fifths")
            next_val = None
            continue

        word = token.word
        if word in _ARTICLES_NL or word in _NEGATIVES_NL:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""

        if word not in string_num_scale and \
                word not in _STRING_NUM_NL and \
                word not in _SUMS_NL and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not is_fractional_nl(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # not a number word: stop if a number was already collected,
            # otherwise discard the articles/negatives gathered so far
            words_only = [collected.word for collected in number_words]
            if number_words and not all(
                    w in _ARTICLES_NL | _NEGATIVES_NL for w in words_only):
                break
            else:
                number_words = []
                continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS_NL \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES_NL \
                and prev_word not in _ARTICLES_NL:
            number_words = [token]
        elif prev_word in _SUMS_NL and word in _SUMS_NL:
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_NL:
            val = _STRING_NUM_NL.get(word)
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale.get(word)
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in _SUMS_NL and val and val < 10:
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = is_fractional_nl(word, short_scale=short_scale)
            current_val = val

        # 2 fifths
        if not ordinals:
            next_val = is_fractional_nl(next_word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES_NL:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            pieces = word.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
                current_val = val
        else:
            # current_val is None when this token set no value of its own
            # (val is left over from an earlier token); it cannot be summed
            # with the previous word either, and None >= 10 would raise.
            if prev_word in _SUMS_NL and word not in _SUMS_NL and \
                    (current_val is None or current_val >= 10):
                # Backtrack - we've got numbers we can't sum.
                number_words.pop()
                val = prev_val
                break
            prev_val = val

            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words
def extract_duration_nl(text):
    """Extract a duration from a Dutch phrase.

    Number words in the text are first converted to digits by
    _convert_words_to_numbers_nl. Every "<number> <unit>" pair is then
    matched (e.g. "10 minuten", "3 dagen 8 uur"), summed per time unit and
    removed from the text. Supported units are microseconds, milliseconds,
    seconds, minutes, hours, days and weeks, in the Dutch spellings listed
    in nl_translations below (including diminutives such as "minuutje").

    Args:
        text (str): string containing a duration

    Returns:
        None if text is empty or None. Otherwise (timedelta, str):
        A tuple containing the duration and the remaining text
        not consumed in the parsing. The first value will
        be None if no duration is found. The text returned
        will have whitespace stripped from the ends.
    """
    if not text:
        return None

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    nl_translations = {
        'microseconds': ["microsecond", "microseconde", "microseconden",
                         "microsecondje", "microsecondjes"],
        'milliseconds': ["millisecond", "milliseconde", "milliseconden",
                         "millisecondje", "millisecondjes"],
        'seconds': ["second", "seconde", "seconden", "secondje",
                    "secondjes"],
        'minutes': ["minuut", "minuten", "minuutje", "minuutjes"],
        'hours': ["uur", "uren", "uurtje", "uurtjes"],
        'days': ["dag", "dagen", "dagje", "dagjes"],
        'weeks': ["week", "weken", "weekje", "weekjes"]
    }

    # The number is captured in a named group; a bare "(?P" without a
    # "<name>" is not valid regular-expression syntax and makes re raise.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}"
    text = _convert_words_to_numbers_nl(text)

    for unit in time_units:
        # Try the longest spellings first so that e.g. "seconden" is
        # consumed as a whole before the shorter "second" can match it.
        unit_nl_words = sorted(nl_translations[unit], key=len, reverse=True)
        for unit_nl in unit_nl_words:
            unit_pattern = pattern.format(unit=unit_nl)
            matches = re.findall(unit_pattern, text)
            value = sum(map(float, matches))
            time_units[unit] = time_units[unit] + value
            # Drop the consumed "<number> <unit>" pairs from the remainder.
            text = re.sub(unit_pattern, '', text)

    text = text.strip()
    # Only build a timedelta when at least one unit was actually found.
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)
Args: text (str): string containing date words dateNow (datetime): A reference date/time for "tommorrow", etc default_time (time): Time to set if no time was found in the string Returns: [datetime, str]: An array containing the datetime and the remaining text not consumed in the parsing, or None if no date or time related text was found. """ def clean_string(s): # clean unneeded punctuation and capitalization among other things. s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace(' de ', ' ').replace(' het ', ' ').replace(' het ', ' ') \ .replace("paar", "2").replace("eeuwen", "eeuw") \ .replace("decennia", "decennium") \ .replace("millennia", "millennium") wordList = s.split() for idx, word in enumerate(wordList): ordinals = ["ste", "de"] if word[0].isdigit(): for ordinal in ordinals: # "second" is the only case we should not do this if ordinal in word and "second" not in word: word = word.replace(ordinal, "") wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 today = anchorDate.strftime("%w") currentYear = anchorDate.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersAM = ['ochtend'] timeQualifiersPM = ['middag', 'avond', 'nacht'] timeQualifiersList = timeQualifiersAM + timeQualifiersPM timeQualifierOffsets = [8, 15, 19, 0] markers = ['op', 'in', 'om', 'tegen', 'over', 'deze', 'rond', 'voor', 'van', "binnen"] days = ["maandag", "dinsdag", "woensdag", "donderdag", "vrijdag", "zaterdag", "zondag"] day_parts = [a + b for a in days for b in timeQualifiersList] months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', 'september', 'oktober', 'november', 
'december'] recur_markers = days + [d+'en' for d in days] + ['weekeinde', 'werkdag', 'weekeinden', 'werkdagen'] months_short = ['jan', 'feb', 'mar', 'apr', 'mei', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec'] year_multiples = ["decennium", "eeuw", "millennium"] day_multiples = ["dagen", "weken", "maanden", "jaren"] words = clean_string(text) for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" start = idx used = 0 # save timequalifier for later if word == "nu" and not datestr: resultStr = " ".join(words[idx + 1:]) resultStr = ' '.join(resultStr.split()) extractedDate = anchorDate.replace(microsecond=0) return [extractedDate, resultStr] elif wordNext in year_multiples: multiplier = None if is_numeric(word): multiplier = extract_number_nl(word) multiplier = multiplier or 1 multiplier = int(multiplier) used += 2 if wordNext == "decennium": yearOffset = multiplier * 10 elif wordNext == "eeuw": yearOffset = multiplier * 100 elif wordNext == "millennium": yearOffset = multiplier * 1000 # paar elif word == "2" and \ wordNextNext in year_multiples: multiplier = 2 used += 2 if wordNextNext == "decennia": yearOffset = multiplier * 10 elif wordNextNext == "eeuwen": yearOffset = multiplier * 100 elif wordNextNext == "millennia": yearOffset = multiplier * 1000 elif word == "2" and \ wordNextNext in day_multiples: multiplier = 2 used += 2 if wordNextNext == "jaren": yearOffset = multiplier elif wordNextNext == "maanden": monthOffset = multiplier elif wordNextNext == "weken": dayOffset = multiplier * 7 elif word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "vandaag" and not fromFlag: dayOffset = 0 used += 1 elif word == "morgen" and not fromFlag: dayOffset = 1 used += 1 elif word == "overmorgen" and not 
fromFlag: dayOffset = 2 used += 1 # parse 5 days, 10 weeks, last week, next week elif word == "dag" or word == "dagen": if wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 elif word == "week" or word == "weken" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev == "volgende": dayOffset = 7 start -= 1 used = 2 elif wordPrev == "vorige": dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "maand" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "volgende": monthOffset = 1 start -= 1 used = 2 elif wordPrev == "vorige": monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "jaar" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "volgend": yearOffset = 1 start -= 1 used = 2 elif wordPrev == "vorig": yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "volgende": if dayOffset <= 2: dayOffset += 7 used += 1 start -= 1 elif wordPrev == "vorige": dayOffset -= 7 used += 1 start -= 1 elif word in day_parts and not fromFlag: d = day_parts.index(word) / len(timeQualifiersList) dayOffset = (d + 1) - int(today) if dayOffset < 0: dayOffset += 7 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in months_short and not fromFlag: try: m = months.index(word) except ValueError: m = months_short.index(word) used += 1 datestr = months[m] if wordPrev and \ (wordPrev[0].isdigit() or (wordPrev == "van" and wordPrevPrev[0].isdigit())): if wordPrev == "van" and wordPrevPrev[0].isdigit(): datestr += " " + words[idx - 2] used += 1 start -= 1 else: datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + months_short validFollowups.append("vandaag") validFollowups.append("morgen") validFollowups.append("volgende") validFollowups.append("vorige") validFollowups.append("nu") if (word == "van" or word == "na") and wordNext in validFollowups: used = 2 fromFlag = True if wordNext == "morgen": dayOffset += 1 elif wordNext == "overmorgen": dayOffset += 2 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext == "volgende": if dayOffset <= 2: 
tmpOffset += 7 used += 1 start -= 1 elif wordNext == "vorige": tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 1] == "deze": start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None military = False for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # parse nacht ochtend, middag, avond used = 0 if word.startswith("gister"): dayOffset = -1 elif word.startswith("morgen"): dayOffset = 1 if word.endswith("nacht"): if hrAbs is None: hrAbs = 0 used += 1 elif word.endswith("ochtend"): if hrAbs is None: hrAbs = 8 used += 1 elif word.endswith("middag"): if hrAbs is None: hrAbs = 15 used += 1 elif word.endswith("avond"): if hrAbs is None: hrAbs = 19 used += 1 # "paar" time_unit elif word == "2" and \ wordNextNext in ["uur", "minuten", "seconden"]: used += 2 if wordNextNext == "uur": hrOffset = 2 elif wordNextNext == "minuten": minOffset = 2 elif wordNextNext == "seconden": secOffset = 2 # parse half an hour, quarter hour elif word == "uur" and \ (wordPrev in markers or wordPrevPrev in markers): if wordPrev == "half": minOffset = 30 elif wordPrev == "kwartier": minOffset = 15 elif wordPrevPrev == "kwartier": minOffset = 15 if idx > 2 and words[idx - 3] in markers: words[idx - 3] = "" if words[idx - 3] == "deze": daySpecified = True words[idx - 2] = "" elif wordPrev == "binnen": hrOffset = 1 else: hrOffset = 1 if wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "deze": daySpecified = True words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc # parse "over een 
minuut" elif word == "minuut" and wordPrev == "over": minOffset = 1 words[idx - 1] = "" used += 1 # parse "over een seconde" elif word == "seconde" and wordPrev == "over": secOffset = 1 words[idx - 1] = "" used += 1 elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if wordNext == "vannacht" or wordNextNext == "vannacht" or \ wordPrev == "vannacht" or wordPrevPrev == "vannacht" or \ wordNextNextNext == "vannacht": remainder = "pm" used += 1 if wordPrev == "vannacht": words[idx - 1] = "" if wordPrevPrev == "vannacht": words[idx - 2] = "" if wordNextNext == "vannacht": used += 1 if wordNextNextNext == "vannacht": used += 1 if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif wordNext == "in" and wordNextNext == "ochtend": remainder = "am" used += 2 elif wordNext == "in" and wordNextNext == "middag": remainder = "pm" used += 2 elif wordNext == "in" and wordNextNext == "avond": remainder = "pm" used += 2 elif wordNext == "'s" and wordNextNext == "ochtends": remainder = "am" used += 2 elif wordNext == "'s" and wordNextNext == "middags": remainder = "pm" used += 2 elif wordNext == "'s" and wordNextNext == "avonds": remainder = "pm" used += 2 elif wordNext == "deze" and wordNextNext == "ochtend": remainder = "am" used = 2 daySpecified = True elif wordNext == "deze" and wordNextNext == "middag": remainder = "pm" used = 2 daySpecified = True elif wordNext == "deze" and wordNextNext == "avond": remainder = "pm" used = 2 daySpecified = True elif wordNext == 
"'s" and wordNextNext == "nachts": if strHH and int(strHH) > 5: remainder = "pm" else: remainder = "am" used += 2 else: if timeQualifier != "": military = True if strHH and int(strHH) <= 12 and \ (timeQualifier in timeQualifiersPM): strHH += str(int(strHH) + 12) else: # try to parse numbers without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 elif ( remainder in recur_markers or wordNext in recur_markers or wordNextNext in recur_markers): # Ex: "7 on mondays" or "3 this friday" # Set strHH so that isTime == True # when am or pm is not specified strHH = strNum used = 1 else: if ( (wordNext == "uren" or wordNext == "uur" or remainder == "uren" or remainder == "uur") and word[0] != '0' and ( int(strNum) < 100 or int(strNum) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minuten" or wordNext == "minuut" or \ remainder == "minuten" or remainder == "minuut": # "in 10 minutes" minOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "seconden" or wordNext == "seconde" \ or remainder == "seconden" or \ remainder == "seconde": # in 5 seconds secOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(strNum) > 100: # military time, eg. "3300 hours" strHH = str(int(strNum) // 100) strMM = str(int(strNum) % 100) military = True if wordNext == "uur" or remainder == "uur": used += 1 elif wordNext and wordNext[0].isdigit(): # military time, e.g. 
"04 38 hours" strHH = strNum strMM = wordNext military = True used += 1 if (wordNextNext == "uur" or remainder == "uur"): used += 1 elif ( wordNext == "" or wordNext == "uur" or ( wordNext == "in" and ( wordNextNext == "de" or wordNextNext == timeQualifier ) ) or wordNext == 'vannacht' or wordNextNext == 'vannacht'): strHH = strNum strMM = "00" if wordNext == "uur": used += 1 if wordNext == "in" or wordNextNext == "in": used += (1 if wordNext == "in" else 2) wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if (wordNextNext and (wordNextNext in timeQualifier or wordNextNextNext in timeQualifier)): if (wordNextNext in timeQualifiersPM or wordNextNextNext in timeQualifiersPM): remainder = "pm" used += 1 if (wordNextNext in timeQualifiersAM or wordNextNextNext in timeQualifiersAM): remainder = "am" used += 1 if timeQualifier != "": if timeQualifier in timeQualifiersPM: remainder = "pm" used += 1 elif timeQualifier in timeQualifiersAM: remainder = "am" used += 1 else: # TODO: Unsure if this is 100% accurate used += 1 military = True else: isTime = False HH = int(strHH) if strHH else 0 MM = int(strMM) if strMM else 0 HH = HH + 12 if remainder == "pm" and HH < 12 else HH HH = HH - 12 if remainder == "am" and HH >= 12 else HH if (not military and remainder not in ['am', 'pm', 'uren', 'minuten', "seconde", "seconden", "uur", "minuut"] and ((not daySpecified) or dayOffset < 1)): # ambiguous time, detect whether they mean this evening or # the next morning based on whether it has already passed if anchorDate.hour < HH or (anchorDate.hour == HH and anchorDate.minute < MM): pass # No modification needed elif anchorDate.hour < HH + 12: HH += 12 else: # has passed, assume the next morning dayOffset += 1 if timeQualifier in timeQualifiersPM and HH < 12: HH += 12 if HH > 24 or MM > 59: isTime = False used = 0 if isTime: hrAbs = HH minAbs = MM used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): if idx + i >= len(words): break 
words[idx + i] = "" if wordPrev == "vroeg": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "laat": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if wordPrev == "deze": daySpecified = True if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "deze": daySpecified = True idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = anchorDate.replace(microsecond=0) if datestr != "": # date included an explicit date, e.g. "june 5" or "june 2, 2017" try: temp = datetime.strptime(datestr, "%B %d") except ValueError: # Try again, allowing the year temp = datetime.strptime(datestr, "%B %d %Y") extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if not hasYear: temp = temp.replace(year=extractedDate.year, tzinfo=extractedDate.tzinfo) if extractedDate < temp: extractedDate = extractedDate.replace( year=int(currentYear), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: # ignore the current HH:MM:SS if relative using days or greater if hrOffset == 0 and minOffset == 0 and secOffset == 0: extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs != -1 and minAbs != -1: # If no time was supplied in the string set the time to default 
def is_fractional_nl(input_str, short_scale=True):
    """Check whether a Dutch word names a fraction.

    The lookup is case-insensitive. Besides the fixed words "heel",
    "half", "halve" and "kwart", every ordinal word from the selected
    scale table whose number is greater than 2 is treated as the
    denominator of a unit fraction.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False

    Returns:
        (bool) or (float): False if not a fraction, otherwise 1.0 divided
        by the denominator the word stands for
    """
    ordinal_names = _SHORT_ORDINAL_STRING_NL if short_scale \
        else _LONG_ORDINAL_STRING_NL

    denominators = {"heel": 1, "half": 2, "halve": 2, "kwart": 4}
    # Ordinals 1 and 2 are skipped: only numbers above 2 become denominators.
    denominators.update({ordinal_names[number]: number
                         for number in ordinal_names if number > 2})

    key = input_str.lower()
    if key not in denominators:
        return False
    return 1.0 / denominators[key]
def normalize_nl(text, remove_articles=True):
    """Dutch string normalization.

    Splits the text on whitespace (which also collapses runs of spaces),
    optionally drops articles, and replaces the number words "nul" through
    "twintig" with their digits. Words are compared as-is, so matching is
    case-sensitive.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop words found in _ARTICLES_NL if True

    Returns:
        str: the normalized words joined by single spaces
    """
    # Position in this list is the numeric value of the word.
    text_numbers = ["nul", "een", "twee", "drie", "vier", "vijf", "zes",
                    "zeven", "acht", "negen", "tien", "elf", "twaalf",
                    "dertien", "veertien", "vijftien", "zestien",
                    "zeventien", "achttien", "negentien", "twintig"]
    # Built once per call instead of once per word; gives O(1) lookups.
    digits_by_word = {name: str(value)
                      for value, name in enumerate(text_numbers)}

    normalized = []
    for word in text.split():
        # The article check comes first, so an article that is also a
        # number word is removed rather than converted.
        if remove_articles and word in _ARTICLES_NL:
            continue
        # Convert numbers into digits, e.g. "twee" -> "2"
        normalized.append(digits_by_word.get(word, word))

    return " ".join(normalized)
def generate_plurals_pl(originals):
    """
    Return a new set or dict containing plural forms of the original values.

    Each original gets two variants, one with 'y' and one with 'ów'
    appended. The two plural forms of "thousand" ('tysiące', 'tysięcy')
    are always added; in the dict case they map to 1000.

    Args:
        originals set(str) or dict(str, any): values to pluralize

    Returns:
        set(str) or dict(str, any)

    """
    suffixes = ('y', 'ów')

    if isinstance(originals, dict):
        plural_map = {}
        # All 'y' forms are inserted before all 'ów' forms.
        for suffix in suffixes:
            for word, number in originals.items():
                plural_map[word + suffix] = number
        plural_map['tysiące'] = 1000
        plural_map['tysięcy'] = 1000
        return plural_map

    plural_words = set()
    for suffix in suffixes:
        for word in originals:
            plural_words.add(word + suffix)
    plural_words.update(('tysiące', 'tysięcy'))
    return plural_words
def _convert_words_to_numbers_pl(text, short_scale=True, ordinals=False):
    """
    Replace number words in a string with their numeric values.

    The text is lower-cased and tokenized; every number found by
    _extract_numbers_with_text_pl is written once, at its first token,
    and the other tokens it spans are dropped.

    Args:
        text str:
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third) should
                          be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The lower-cased text, with numbers subbed in where appropriate.

    """
    tokens = tokenize(text.lower())
    pending = _extract_numbers_with_text_pl(tokens, short_scale, ordinals)
    pending.sort(key=lambda number: number.start_index)

    cursor = 0  # position of the next number that still has to be emitted
    pieces = []
    for token in tokens:
        current = pending[cursor] if cursor < len(pending) else None

        # Plain word: either no numbers are left or this token precedes
        # the next one.
        if current is None or token.index < current.start_index:
            pieces.append(token.word)
            continue

        if token.index == current.start_index:
            pieces.append(str(current.value))
        if token.index == current.end_index:
            # Last token of this number; move on to the following one.
            cursor += 1

    return ' '.join(pieces)
def _extract_number_with_text_pl_helper(tokens,
                                        short_scale=True, ordinals=False,
                                        fractional_numbers=True):
    """
    Helper for _extract_number_with_text_pl.

    When fractional_numbers is set, the fraction extractor and then the
    decimal extractor are tried, and the first truthy value wins. In every
    other case the whole-number extractor supplies the result.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:
        fractional_numbers boolean:

    Returns:
        int or float, [Tokens]

    """
    if fractional_numbers:
        # Order matters: fractions are attempted before decimals.
        for extractor in (_extract_fraction_with_text_pl,
                          _extract_decimal_with_text_pl):
            value, value_tokens = extractor(tokens, short_scale, ordinals)
            if value:
                return value, value_tokens

    return _extract_whole_number_with_text_pl(tokens, short_scale, ordinals)
""" for c in _FRACTION_MARKER: partitions = partition_list(tokens, lambda t: t.word == c) if len(partitions) == 3: numbers1 = \ _extract_numbers_with_text_pl(partitions[0], short_scale, ordinals, fractional_numbers=False) numbers2 = \ _extract_numbers_with_text_pl(partitions[2], short_scale, ordinals, fractional_numbers=True) if not numbers1 or not numbers2: return None, None # ensure first is not a fraction and second is a fraction num1 = numbers1[-1] num2 = numbers2[0] if num1.value >= 1 and 0 < num2.value < 1: return num1.value + num2.value, \ num1.tokens + partitions[1] + num2.tokens return None, None def _extract_decimal_with_text_pl(tokens, short_scale, ordinals): """ Extract decimal numbers from a string. This function handles text such as '2 point 5'. Notes: While this is a helper for extractnumber_en, it also depends on extractnumber_en, to parse out the components of the decimal. This does not currently handle things like: number dot number number number Args: tokens [Token]: The text to parse. short_scale boolean: ordinals boolean: Returns: (float, [Token]) The value found and relevant tokens. (None, None) if no decimal value is found. """ for c in _DECIMAL_MARKER: partitions = partition_list(tokens, lambda t: t.word == c) if len(partitions) == 3: numbers1 = \ _extract_numbers_with_text_pl(partitions[0], short_scale, ordinals, fractional_numbers=False) numbers2 = \ _extract_numbers_with_text_pl(partitions[2], short_scale, ordinals, fractional_numbers=False) if not numbers1 or not numbers2: return None, None number = numbers1[-1] decimal = numbers2[0] # TODO handle number dot number number number if "." not in str(decimal.text): return number.value + float('0.' + str(decimal.value)), \ number.tokens + partitions[1] + decimal.tokens return None, None def _extract_whole_number_with_text_pl(tokens, short_scale, ordinals): """ Handle numbers not handled by the decimal or fraction functions. This is generally whole numbers. 
Note that phrases such as "one half" will be handled by this function, while "one and a half" are handled by the fraction function. Args: tokens [Token]: short_scale boolean: ordinals boolean: Returns: int or float, [Tokens] The value parsed, and tokens that it corresponds to. """ multiplies, string_num_ordinal, string_num_scale = \ _initialize_number_data(short_scale) number_words = [] # type: [Token] val = False prev_val = None next_val = None to_sum = [] for idx, token in enumerate(tokens): current_val = None if next_val: next_val = None continue word = token.word prev_word = tokens[idx - 1].word if idx > 0 else "" next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" if is_numeric(word[:-1]) and word.endswith('.'): # explicit ordinals, 1., 2., 3., 4.... N. word = word[:-1] word = normalize_word_pl(word) if word not in string_num_scale and \ word not in _STRING_NUM_PL and \ word not in _SUMS and \ word not in multiplies and \ not (ordinals and word in string_num_ordinal) and \ not is_numeric(word) and \ not isFractional_pl(word) and \ not look_for_fractions(word.split('/')): words_only = [token.word for token in number_words] if number_words and not all([w in _NEGATIVES for w in words_only]): break else: number_words = [] continue elif word not in multiplies \ and prev_word not in multiplies \ and prev_word not in _SHORT_SCALE_PL.values() \ and prev_word not in _SUMS \ and not (ordinals and prev_word in string_num_ordinal) \ and prev_word not in _NEGATIVES: number_words = [token] elif prev_word in _SUMS and word in _SUMS: number_words = [token] else: number_words.append(token) # is this word already a number ? if is_numeric(word): if word.isdigit(): # doesn't work with decimals val = int(word) else: val = float(word) current_val = val # is this word the name of a number ? 
if word in _STRING_NUM_PL: val = _STRING_NUM_PL.get(word) current_val = val elif word in string_num_scale: val = string_num_scale.get(word) current_val = val elif ordinals and word in string_num_ordinal: val = string_num_ordinal[word] current_val = val if word in multiplies: if not prev_val: prev_val = 1 val = prev_val * val prev_val = None # is the prev word a number and should we sum it? # twenty two, fifty six if prev_val: if (prev_word in string_num_ordinal and val and val < prev_val) or \ (prev_word in _STRING_NUM_PL and val and val < prev_val and val // 10 != prev_val // 10) or \ all([prev_word in multiplies, val < prev_val if prev_val else False]): val += prev_val if next_word in multiplies: prev_val = val continue # is this a spoken fraction? # half cup if val is False: val = isFractional_pl(word) current_val = val # 2 fifths if not ordinals: next_val = isFractional_pl(next_word) if next_val: if not val: val = 1 val *= next_val number_words.append(tokens[idx + 1]) # is this a negative number? if val and prev_word and prev_word in _NEGATIVES: val = 0 - val if next_word in _STRING_NUM_PL: prev_val = val # let's make sure it isn't a fraction if not val: # look for fractions like "2/3" aPieces = word.split('/') if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) number_words.append(tokens[idx + 1]) else: if all([ prev_word in _SUMS, word not in _SUMS, word not in multiplies, current_val >= 10]): # Backtrack - we've got numbers we can't sum. number_words.pop() val = prev_val break prev_val = val if word in multiplies and next_word not in multiplies: # handle long numbers # six hundred sixty six # two million five hundred thousand # # This logic is somewhat complex, and warrants # extensive documentation for the next coder's sake. # # The current word is a power of ten. `current_val` is # its integer value. `val` is our working sum # (above, when `current_val` is 1 million, `val` is # 2 million.) 
# # We have a dict `string_num_scale` containing [value, word] # pairs for "all" powers of ten: string_num_scale[10] == "ten. # # We need go over the rest of the tokens, looking for other # powers of ten. If we find one, we compare it with the current # value, to see if it's smaller than the current power of ten. # # Numbers which are not powers of ten will be passed over. # # If all the remaining powers of ten are smaller than our # current value, we can set the current value aside for later, # and begin extracting another portion of our final result. # For example, suppose we have the following string. # The current word is "million".`val` is 9000000. # `current_val` is 1000000. # # "nine **million** nine *hundred* seven **thousand** # six *hundred* fifty seven" # # Iterating over the rest of the string, the current # value is larger than all remaining powers of ten. # # The if statement passes, and nine million (9000000) # is appended to `to_sum`. # # The main variables are reset, and the main loop begins # assembling another number, which will also be appended # under the same conditions. # # By the end of the main loop, to_sum will be a list of each # "place" from 100 up: [9000000, 907000, 600] # # The final three digits will be added to the sum of that list # at the end of the main loop, to produce the extracted number: # # sum([9000000, 907000, 600]) + 57 # == 9,000,000 + 907,000 + 600 + 57 # == 9,907,657 # # >>> foo = "nine million nine hundred seven thousand six # hundred fifty seven" # >>> extract_number(foo) # 9907657 time_to_sum = True for other_token in tokens[idx+1:]: if other_token.word in multiplies: if string_num_scale[other_token.word] >= current_val: time_to_sum = False else: continue if not time_to_sum: break if time_to_sum: to_sum.append(val) val = 0 prev_val = 0 if val is not None and to_sum: val += sum(to_sum) return val, number_words def _initialize_number_data(short_scale): """ Generate dictionaries of words to numbers, based on scale. 
def extract_duration_pl(text):
    """
    Convert a Polish phrase into a duration.

    Convert things like:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"
    (in their Polish form) into a timedelta.

    Spelled-out numbers are first converted to digits with
    _convert_words_to_numbers_pl; every "<number> <unit>" occurrence is then
    summed per unit and removed from the text. The words used in the duration
    are consumed, and the remainder is returned.

    As an example, "set a timer for 5 minutes" would return
    (timedelta(minutes=5), "set a timer for").

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
        None: if `text` is empty (kept for backward compatibility; callers
              must check before unpacking).
    """
    if not text:
        return None

    time_units = {
        'microseconds': None,
        'milliseconds': None,
        'seconds': None,
        'minutes': None,
        'hours': None,
        'days': None,
        'weeks': None
    }

    # "<number>[ -]<unit stem><optional Polish ending>"; the single named
    # group makes re.findall return just the numeric part of each match.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ayeę]?"
    text = _convert_words_to_numbers_pl(text)

    for unit in _TIME_UNITS_CONVERSION:
        unit_pattern = pattern.format(unit=unit)
        matches = re.findall(unit_pattern, text)
        value = sum(map(float, matches))
        unit_en = _TIME_UNITS_CONVERSION.get(unit)
        # several Polish stems may map onto the same timedelta keyword;
        # only fill the slot while it is still empty/zero
        if time_units[unit_en] is None or time_units.get(unit_en) == 0:
            time_units[unit_en] = value
        text = re.sub(unit_pattern, '', text)

    text = text.strip()

    # drop empty slots: timedelta() rejects None and zero adds nothing
    found_units = {name: amount
                   for name, amount in time_units.items() if amount}
    duration = timedelta(**found_units) if found_units else None

    return (duration, text)
s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace("para", "2") wordList = s.split() for idx, word in enumerate(wordList): ordinals = ["ci", "szy", "gi"] if word[0].isdigit(): for ordinal in ordinals: if ordinal in word: word = word.replace(ordinal, "") wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if string == "": return None dateNow = dateNow or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersAM = ['rano'] timeQualifiersPM = ['wieczór', 'w nocy'] timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) markers = ['na', 'w', 'we', 'na', 'przez', 'ten', 'około', 'dla', 'o', "pomiędzy", 'za', 'do'] days = list(_DAYS_TO_EN.keys()) recur_markers = days + ['weekend', 'weekendy'] monthsShort = ['sty', 'lut', 'mar', 'kwi', 'maj', 'cze', 'lip', 'sie', 'wrz', 'paź', 'lis', 'gru'] year_multiples = ['dekada', 'wiek', 'milenia'] words = clean_string(string) for idx, word in enumerate(words): if word == "": continue wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # this isn't in clean string because I don't want to save back to words start = idx used = 0 # save timequalifier for later if word == 'w' and wordNext == 'tę': used += 2 if word == "temu" and dayOffset: dayOffset = - dayOffset used += 1 if word == "teraz" and not datestr: resultStr = " ".join(words[idx + 1:]) resultStr = ' '.join(resultStr.split()) extractedDate = dateNow.replace(microsecond=0) return [extractedDate, resultStr] elif wordNext in year_multiples: multiplier = None if is_numeric(word): 
multiplier = extract_number_pl(word) multiplier = multiplier or 1 multiplier = int(multiplier) used += 2 if _TIME_UNITS_NORMALIZATION.get(wordNext) == "dekada": yearOffset = multiplier * 10 elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "wiek": yearOffset = multiplier * 100 elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "milenia": yearOffset = multiplier * 1000 elif word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "dzisiaj" and not fromFlag: dayOffset = 0 used += 1 elif word == "jutro" and not fromFlag: dayOffset = 1 used += 1 elif word == "przedwczoraj" and not fromFlag: dayOffset = -2 used += 1 elif word == "wczoraj" and not fromFlag: dayOffset = -1 used += 1 elif word == "pojutrze" and not fromFlag: dayOffset = 2 used = 1 elif word == "dzień" and wordNext != 'robocze': if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 elif word == "tydzień" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev == "następny": dayOffset = 7 start -= 1 used = 2 elif wordPrev == "poprzedni" or wordPrev == 'ostatni': dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "miesiąc" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "następny": monthOffset = 1 start -= 1 used = 2 elif wordPrev == "poprzedni" or wordPrev == 'ostatni': monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "rok" and not fromFlag and wordPrev: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "następny": yearOffset = 1 start -= 1 used = 2 elif wordPrev == "poprzedni" or wordPrev == 'ostatni': yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = _DAYS_TO_EN.get(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "następny": if dayOffset <= 2: dayOffset += 7 used += 1 start -= 1 elif wordPrev == "poprzedni" or wordPrev == 'ostatni': dayOffset -= 7 used += 1 start -= 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in _MONTHS_TO_EN or word in monthsShort and not fromFlag: used += 1 datestr = _MONTHS_TO_EN[word] if wordPrev and wordPrev[0].isdigit(): datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + list(_MONTHS_TO_EN.keys()) + monthsShort validFollowups.append("dzisiaj") validFollowups.append("jutro") validFollowups.append("wczoraj") validFollowups.append("następny") validFollowups.append("poprzedni") validFollowups.append('ostatni') validFollowups.append("teraz") validFollowups.append("tego") if (word == "od" or word == "po") and wordNext in validFollowups: used = 2 fromFlag = True if wordNext == "jutro": dayOffset += 1 elif wordNext == "wczoraj": dayOffset -= 1 elif wordNext in days: d = _DAYS_TO_EN.get(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = _DAYS_TO_EN.get(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext == "następny": if dayOffset <= 2: tmpOffset += 7 used += 1 start -= 1 elif wordNext == "poprzedni" or wordNext == 'ostatni': tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 1] == "ten": # this start -= 1 used 
+= 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None military = False for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "południe": hrAbs = 12 used += 1 elif word == "północ" or word == 'północy': hrAbs = 0 used += 1 elif word == "rano": if hrAbs is None: hrAbs = 8 used += 1 elif word == "po" and wordNext == "południu": if hrAbs is None: hrAbs = 15 used += 2 elif word == "wieczór" or word == 'wieczorem': if hrAbs is None: hrAbs = 19 used += 1 elif word == "nocy": if hrAbs is None: hrAbs = 22 used += 1 # parse half an hour, quarter hour elif word == "godzina" and (wordPrev.isdigit() or wordPrev in markers or wordPrevPrev in markers): if wordPrev == "pół": minOffset = 30 else: hrOffset = 1 if wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "dzisiaj": daySpecified = True words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc # parse in a minute elif word == "minuta" and (wordPrev.isdigit() or wordPrev in markers): minOffset = 1 words[idx - 1] = "" used += 1 # parse in a second elif word == "sekunda" and (wordPrev.isdigit() or wordPrev in markers): secOffset = 1 words[idx - 1] = "" used += 1 elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if wordNext == "wieczorem" or wordPrev == "wieczorem" or \ wordNext == 'wieczór' or wordPrev == 'wieczór' or \ (wordNext == 'po' and wordNextNext == 'południu'): remainder = "pm" used += 2 if wordNext == 'po' else 1 if wordPrev == "wieczorem" or wordPrev == 'wieczór': words[idx - 1] = "" if 
':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": if wordNext == "rano": remainder = "am" used += 1 elif wordNext == "po" and wordNextNext == "południu": remainder = "pm" used += 2 elif wordNext == "wieczorem": remainder = "pm" used += 1 elif wordNext == "rano": remainder = "am" used += 1 elif wordNext == "w" and wordNextNext == "nocy": if strHH and int(strHH) > 5: remainder = "pm" else: remainder = "am" used += 2 else: if timeQualifier != "": military = True if strHH and int(strHH) <= 12 and \ (timeQualifier in timeQualifiersPM): strHH += str(int(strHH) + 12) else: # try to parse numbers without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or (word[0].isdigit() and (wordNext == 'wieczorem' or wordNext == 'wieczór')) or (word[0].isdigit() and wordNext == 'po' and wordNextNext == 'południu') or (word[0].isdigit() and wordNext == 'w' and wordNextNext == 'nocy')): strHH = strNum remainder = "pm" used = 2 if wordNext in ['po', 'w'] else 1 elif ( remainder == "am" or (word[0].isdigit() and wordNext == 'rano')): strHH = strNum remainder = "am" used = 1 elif ( remainder in recur_markers or wordNext in recur_markers or wordNextNext in recur_markers or ( wordNext == 'w' and wordNextNext == 'dzień' and wordNextNextNext == 'robocze' )): # Ex: "7 on mondays" or "3 this friday" # Set strHH so that isTime == True # when am or pm is not specified strHH = strNum used = 1 else: 
if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": # "in 10 hours" hrOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "minuta" or \ _TIME_UNITS_NORMALIZATION.get(remainder) == "minuta": # "in 10 minutes" minOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "sekunda" \ or _TIME_UNITS_NORMALIZATION.get(remainder) == "sekunda": # in 5 seconds secOffset = int(strNum) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(strNum) > 100: # military time, eg. "3300 hours" strHH = str(int(strNum) // 100) strMM = str(int(strNum) % 100) military = True if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": used += 1 elif wordNext and wordNext[0].isdigit(): # military time, e.g. "04 38 hours" strHH = strNum strMM = wordNext military = True used += 1 elif ( wordNext == "" or wordNext == "w" or wordNext == 'nocy' or wordNextNext == 'nocy'): strHH = strNum strMM = "00" if wordNext == "za" or wordNextNext == "za": used += (1 if wordNext == "za" else 2) wordNextNextNext = words[idx + 3] \ if idx + 3 < len(words) else "" if (wordNextNext and (wordNextNext in timeQualifier or wordNextNextNext in timeQualifier)): if (wordNextNext in timeQualifiersPM or wordNextNextNext in timeQualifiersPM): remainder = "pm" used += 1 if (wordNextNext in timeQualifiersAM or wordNextNextNext in timeQualifiersAM): remainder = "am" used += 1 if timeQualifier != "": if timeQualifier in timeQualifiersPM: remainder = "pm" used += 1 elif timeQualifier in timeQualifiersAM: remainder = "am" used += 1 else: # TODO: Unsure if this is 100% accurate used += 1 military = True else: isTime = False HH = int(strHH) if strHH else 0 MM = int(strMM) if strMM else 0 HH = HH + 12 if remainder == "pm" and HH < 12 else HH HH = HH - 12 if remainder == "am" and HH >= 12 
else HH if (not military and remainder not in ['am', 'pm'] and remainder not in _TIME_UNITS_NORMALIZATION and ((not daySpecified) or 0 <= dayOffset < 1)): # ambiguous time, detect whether they mean this evening or # the next morning based on whether it has already passed if dateNow.hour < HH or (dateNow.hour == HH and dateNow.minute < MM): pass # No modification needed elif dateNow.hour < HH + 12: HH += 12 else: # has passed, assume the next morning dayOffset += 1 if timeQualifier in timeQualifiersPM and HH < 12: HH += 12 if HH > 24 or MM > 59: isTime = False used = 0 if isTime: hrAbs = HH minAbs = MM used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): if idx + i >= len(words): break words[idx + i] = "" if wordPrev == "rano": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "wieczorem": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if wordPrev == "najbliższą": daySpecified = True if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" if wordPrevPrev == "najbliższą": daySpecified = True idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow.replace(microsecond=0) if datestr != "": # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" try: temp = datetime.strptime(datestr, "%B %d") except ValueError: # Try again, allowing the year temp = datetime.strptime(datestr, "%B %d %Y") extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if not hasYear: temp = temp.replace(year=extractedDate.year, tzinfo=extractedDate.tzinfo) if extractedDate < temp: extractedDate = extractedDate.replace( year=int(currentYear), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extractedDate.tzinfo) else: # ignore the current HH:MM:SS if relative using days or greater if hrOffset == 0 and minOffset == 0 and secOffset == 0: extractedDate = extractedDate.replace(hour=0, minute=0, second=0) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs != -1 and minAbs != -1: # If no time was supplied in the string set the time to default # time if it's available if hrAbs is None and minAbs is None and default_time is not None: hrAbs, minAbs = default_time.hour, default_time.minute else: hrAbs = hrAbs or 0 minAbs = minAbs or 0 extractedDate = extractedDate + relativedelta(hours=hrAbs, minutes=minAbs) if (hrAbs != 0 or minAbs != 0) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + relativedelta(hours=hrOffset) if minOffset != 0: extractedDate = extractedDate + relativedelta(minutes=minOffset) if secOffset != 0: extractedDate = 
def isFractional_pl(input_str, short_scale=True):
    """
    Check whether the given word names a fraction.

    The lower-cased word is looked up in _REV_FRACTITONS and, when present,
    the reciprocal of the stored value is returned.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted for API compatibility; not used here

    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    normalized = input_str.lower()
    return (1.0 / _REV_FRACTITONS[normalized]
            if normalized in _REV_FRACTITONS else False)
def normalize_pl(text, remove_articles=True):
    """
    Polish string normalization.

    Splits the text on whitespace (which also collapses repeated spaces) and
    rewrites each word in turn: time units are mapped through
    _TIME_UNITS_NORMALIZATION, fraction words, ordinals and number words are
    replaced by the string form of their key/value in the lookup tables, and
    a few inflected forms ("następną", "ostatnia", "jutra", ...) are reduced
    to a base form.

    Args:
        text (str): the text to normalize
        remove_articles (bool): when True the word "i" is dropped

    Returns:
        (str): the normalized words joined by single spaces
    """

    def first_key_for(table, value):
        # first key (in table order) whose value equals `value`
        return next(key for key, item in table.items() if item == value)

    next_forms = ('następną', 'następna', 'następnym', 'następnej')
    previous_forms = ('ostatnią', 'ostatnia', 'ostatnim', 'ostatniej',
                      'poprzednią', 'poprzednia', 'poprzednim',
                      'poprzedniej')
    tomorrow_forms = ('jutra', 'jutrze')

    pieces = []
    for word in text.split():
        if remove_articles and word == "i":
            continue

        if word in _TIME_UNITS_NORMALIZATION:
            word = _TIME_UNITS_NORMALIZATION[word]

        if word in _REV_FRACTITONS:
            word = str(_REV_FRACTITONS[word])

        # each table is consulted in order, on the result of the previous one
        for table in (_ORDINAL_BASE_PL, _NUM_STRING_PL, _ALT_ORDINALS_PL):
            if word in table.values():
                word = str(first_key_for(table, word))

        if word in next_forms:
            word = 'następny'
        elif word in previous_forms:
            word = 'poprzedni'
        elif word in tomorrow_forms:
            word = 'jutro'
        elif word == 'wieczorem':
            word = 'wieczór'
        elif word == 'poranne':
            word = 'rano'

        pieces.append(word)

    return " ".join(pieces)
def is_fractional_pt(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Matching is case-insensitive and a single trailing "s" (plural, e.g.
    "quintos") is ignored.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted for API compatibility; not used here
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # normalise case once so that every comparison below sees the same text
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "quintos" -> "quinto"

    denominators = {
        "meio": 2,
        "terço": 3,
        "quarto": 4,
        "quinto": 5,
        "sexto": 6,
        "setimo": 7,
        "sétimo": 7,
        "septimo": 7,
        "séptimo": 7,
        "oitavo": 8,
        "nono": 9,
        "décimo": 10,
        "vigésimo": 20,
        "trigésimo": 30,
        "centésimo": 100,
        "milésimo": 1000,
    }

    denominator = denominators.get(word)
    if denominator is None:
        return False
    return 1.0 / denominator
if word in _NUMBERS_PT: val = _NUMBERS_PT[word] elif word.isdigit(): # doesn't work with decimals val = int(word) elif is_numeric(word): val = float(word) elif is_fractional_pt(word): if not result: result = 1 result = result * is_fractional_pt(word) count += 1 continue if not val: # look for fractions like "2/3" aPieces = word.split('/') # if (len(aPieces) == 2 and is_numeric(aPieces[0]) # and is_numeric(aPieces[1])): if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) if val: if result is None: result = 0 # handle fractions if next_word != "avos": result += val else: result = float(result) / float(val) if next_word is None: break # number word and fraction ands = ["e"] if next_word in ands: zeros = 0 if result is None: count += 1 continue newWords = aWords[count + 2:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_pt(newText[:-1]) if afterAndVal: if result < afterAndVal or result < 20: while afterAndVal > 1: afterAndVal = afterAndVal / 10.0 for word in newWords: if word == "zero" or word == "0": zeros += 1 else: break for _ in range(0, zeros): afterAndVal = afterAndVal / 10.0 result += afterAndVal break elif next_next_word is not None: if next_next_word in ands: newWords = aWords[count + 3:] newText = "" for word in newWords: newText += word + " " afterAndVal = extract_number_pt(newText[:-1]) if afterAndVal: if result is None: result = 0 result += afterAndVal break decimals = ["ponto", "virgula", "vírgula", ".", ","] if next_word in decimals: zeros = 0 newWords = aWords[count + 2:] newText = "" for word in newWords: newText += word + " " for word in newWords: if word == "zero" or word == "0": zeros += 1 else: break afterDotVal = str(extract_number_pt(newText[:-1])) afterDotVal = zeros * "0" + afterDotVal result = float(str(result) + "." 
+ afterDotVal) break count += 1 # Return the $str with the number related words removed # (now empty strings, so strlen == 0) # aWords = [word for word in aWords if len(word) > 0] # text = ' '.join(aWords) if "." in str(result): integer, dec = str(result).split(".") # cast float to int if dec == "0": result = int(integer) return result or False class PortugueseNormalizer(Normalizer): with open(resolve_resource_file("text/pt-pt/normalize.json")) as f: _default_config = json.load(f) @staticmethod def tokenize(utterance): # Split things like 12% utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance) # Split things like #1 utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance) # Split things like amo-te utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3", utterance) tokens = utterance.split() if tokens[-1] == '-': tokens = tokens[:-1] return tokens def normalize_pt(text, remove_articles=True): """ PT string normalization """ return PortugueseNormalizer().normalize(text, remove_articles) def extract_datetime_pt(text, anchorDate=None, default_time=None): def clean_string(s): # cleans the input string of unneeded punctuation and capitalization # among other things symbols = [".", ",", ";", "?", "!", "º", "ª"] noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de", "ao", "aos"] for word in symbols: s = s.replace(word, "") for word in noise_words: s = s.replace(" " + word + " ", " ") s = s.lower().replace( "á", "a").replace( "ç", "c").replace( "à", "a").replace( "ã", "a").replace( "é", "e").replace( "è", "e").replace( "ê", "e").replace( "ó", "o").replace( "ò", "o").replace( "-", " ").replace( "_", "") # handle synonims and equivalents, "tomorrow early = tomorrow morning synonims = {"manha": ["manhazinha", "cedo", "cedinho"], "tarde": ["tardinha", "tarde"], "noite": ["noitinha", "anoitecer"], "todos": ["ao", "aos"], "em": ["do", "da", "dos", "das", "de"]} for syn in synonims: for word in synonims[syn]: s = s.replace(" " + word + " ", " " + syn 
+ " ") # relevant plurals, cant just extract all s in pt wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", "minutos", "segundos", "nas", "nos", "proximas", "seguintes", "horas"] for _, word in enumerate(wordlist): s = s.replace(word, word.rstrip('s')) s = s.replace("meses", "mes").replace("anteriores", "anterior") return s def date_found(): return found or \ ( datestr != "" or timeStr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" words = clean_string(text).split(" ") timeQualifiersList = ['manha', 'tarde', 'noite'] time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", "no", "dia", "hora"] days = ['segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado', 'domingo'] months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', 'julho', 'agosto', 'setembro', 'outubro', 'novembro', 'dezembro'] monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', 'set', 'out', 'nov', 'dec'] nexts = ["proximo", "proxima"] suffix_nexts = ["seguinte", "subsequente", "seguir"] lasts = ["ultimo", "ultima"] suffix_lasts = ["passada", "passado", "anterior", "antes"] nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] prevs = ["antes", "ante", "previa", "previamente", "anterior"] froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", "depois", "por", "proxima", "proximo", "da", "do", "de"] thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", "nessa"] froms += thises lists = nxts + prevs + froms + time_indicators for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx 
- 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" start = idx used = 0 # save timequalifier for later if word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, yesterday elif word == "hoje" and not fromFlag: dayOffset = 0 used += 1 elif word == "amanha" and not fromFlag: dayOffset = 1 used += 1 elif word == "ontem" and not fromFlag: dayOffset -= 1 used += 1 # "before yesterday" and "before before yesterday" elif (word == "anteontem" or (word == "ante" and wordNext == "ontem")) and not fromFlag: dayOffset -= 2 used += 1 if wordNext == "ontem": used += 1 elif word == "ante" and wordNext == "ante" and wordNextNext == \ "ontem" and not fromFlag: dayOffset -= 3 used += 3 elif word == "anteanteontem" and not fromFlag: dayOffset -= 3 used += 1 # day after tomorrow elif word == "depois" and wordNext == "amanha" and not fromFlag: dayOffset += 2 used = 2 # day before yesterday elif word == "antes" and wordNext == "ontem" and not fromFlag: dayOffset -= 2 used = 2 # parse 5 days, 10 weeks, last week, next week, week after elif word == "dia": if wordNext == "depois" or wordNext == "antes": used += 1 if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used += 1 elif (wordPrev and wordPrev[0].isdigit() and wordNext not in months and wordNext not in monthsShort): dayOffset += int(wordPrev) start -= 1 used += 2 elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ months and wordNextNext not in monthsShort: dayOffset += int(wordNext) start -= 1 used += 2 elif word == "semana" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 for w in nexts: if wordPrev == w: dayOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: dayOffset = -7 start -= 1 used = 2 for w in suffix_nexts: 
if wordNext == w: dayOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "mes" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: monthOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: monthOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: monthOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: monthOffset = -7 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "ano" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 for w in nexts: if wordPrev == w: yearOffset = 7 start -= 1 used = 2 for w in lasts: if wordPrev == w: yearOffset = -7 start -= 1 used = 2 for w in suffix_nexts: if wordNext == w: yearOffset = 7 start -= 1 used = 2 for w in suffix_lasts: if wordNext == w: yearOffset = -7 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 for w in nexts: if wordPrev == w: dayOffset += 7 used += 1 start -= 1 for w in lasts: if wordPrev == w: dayOffset -= 7 used += 1 start -= 1 for w in suffix_nexts: if wordNext == w: dayOffset += 7 used += 1 start -= 1 for w in suffix_lasts: if wordNext == w: dayOffset -= 7 used += 1 start -= 1 if wordNext == "feira": used += 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and wordPrev[0].isdigit(): # 13 maio datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): # maio 13 datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False elif wordPrevPrev and wordPrevPrev[0].isdigit(): # 13 dia maio datestr += " " + wordPrevPrev start -= 2 used += 2 if wordNext and word[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNextNext and wordNextNext[0].isdigit(): # maio dia 13 datestr += " " + wordNextNext used += 2 if wordNextNextNext and wordNextNextNext[0].isdigit(): datestr += " " + wordNextNextNext used += 1 hasYear = True else: hasYear = False if datestr in months: datestr = "" # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("hoje") validFollowups.append("amanha") validFollowups.append("ontem") validFollowups.append("anteontem") validFollowups.append("agora") validFollowups.append("ja") validFollowups.append("ante") # TODO debug word "depois" that one is failing for some reason if word in froms 
and wordNext in validFollowups: if not (wordNext == "amanha" and wordNext == "ontem") and not ( word == "depois" or word == "antes" or word == "em"): used = 2 fromFlag = True if wordNext == "amanha" and word != "depois": dayOffset += 1 elif wordNext == "ontem": dayOffset -= 1 elif wordNext == "anteontem": dayOffset -= 2 elif wordNext == "ante" and wordNextNext == "ontem": dayOffset -= 2 elif (wordNext == "ante" and wordNextNext == "ante" and wordNextNextNext == "ontem"): dayOffset -= 3 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if wordNextNext == "feira": used += 1 if tmpOffset < 0: tmpOffset += 7 if wordNextNext: if wordNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNextNextNext: if wordNextNextNext in nxts: tmpOffset += 7 used += 1 elif wordNextNextNext in prevs: tmpOffset -= 7 used += 1 dayOffset += tmpOffset if wordNextNextNext == "feira": used += 1 if wordNext in months: used -= 1 if used > 0: if start - 1 > 0 and words[start - 1] in lists: start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in lists: words[start - 1] = "" found = True daySpecified = True # parse time timeStr = "" hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None military = False for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "meio" and wordNext == "dia": hrAbs = 12 used += 2 elif word == "meia" and wordNext == "noite": hrAbs = 
0 used += 2 elif word == "manha": if not hrAbs: hrAbs = 8 used += 1 elif word == "tarde": if not hrAbs: hrAbs = 15 used += 1 elif word == "meio" and wordNext == "tarde": if not hrAbs: hrAbs = 17 used += 2 elif word == "meio" and wordNext == "manha": if not hrAbs: hrAbs = 10 used += 2 elif word == "fim" and wordNext == "tarde": if not hrAbs: hrAbs = 19 used += 2 elif word == "fim" and wordNext == "manha": if not hrAbs: hrAbs = 11 used += 2 elif word == "tantas" and wordNext == "manha": if not hrAbs: hrAbs = 4 used += 2 elif word == "noite": if not hrAbs: hrAbs = 22 used += 1 # parse half an hour, quarter hour elif word == "hora" and \ (wordPrev in time_indicators or wordPrevPrev in time_indicators): if wordPrev == "meia": minOffset = 30 elif wordPrev == "quarto": minOffset = 15 elif wordPrevPrev == "quarto": minOffset = 15 if idx > 2 and words[idx - 3] in time_indicators: words[idx - 3] = "" words[idx - 2] = "" else: hrOffset = 1 if wordPrevPrev in time_indicators: words[idx - 2] = "" words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif wordNext == "manha": remainder = "am" used += 1 elif wordNext == "tarde": remainder = "pm" used += 1 elif wordNext == "noite": if 0 < int(word[0]) < 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext in thises and wordNextNext == "manha": remainder = "am" used = 2 elif wordNext in thises and wordNextNext == "tarde": remainder = 
"pm" used = 2 elif wordNext in thises and wordNextNext == "noite": remainder = "pm" used = 2 else: if timeQualifier != "": military = True if strHH <= 12 and \ (timeQualifier == "manha" or timeQualifier == "tarde"): strHH += 12 else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if (wordNext == "pm" or wordNext == "p.m." or wordNext == "tarde"): strHH = strNum remainder = "pm" used = 1 elif (wordNext == "am" or wordNext == "a.m." or wordNext == "manha"): strHH = strNum remainder = "am" used = 1 elif (int(word) > 100 and ( wordPrev == "o" or wordPrev == "oh" or wordPrev == "zero" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 military = True if wordNext == "hora": used += 1 elif ( wordNext == "hora" and word[0] != '0' and ( int(word) < 100 and int(word) > 2400 )): # ignores military time # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minuto": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "segundo": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 military = True if wordNext == "hora": used += 1 elif wordNext == "" or ( wordNext == "em" and wordNextNext == "ponto"): strHH = word strMM = 00 if wordNext == "em" and wordNextNext == "ponto": used += 2 if wordNextNextNext == "tarde": remainder = "pm" used += 
1 elif wordNextNextNext == "manha": remainder = "am" used += 1 elif wordNextNextNext == "noite": if 0 > int(strHH) > 6: remainder = "am" else: remainder = "pm" used += 1 elif wordNext[0].isdigit(): strHH = word strMM = wordNext military = True used += 1 if wordNextNext == "hora": used += 1 else: isTime = False strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if (remainder == "pm" and 0 < strHH < 12) else strHH strHH = strHH - 12 if (remainder == "am" and 0 < strHH >= 12) else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "em" or wordPrev == "ponto": words[words.index(wordPrev)] = "" if idx > 0 and wordPrev in time_indicators: words[idx - 1] = "" if idx > 1 and wordPrevPrev in time_indicators: words[idx - 2] = "" idx += used - 1 found = True # check that we found a date if not date_found: return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": en_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] for idx, en_month in enumerate(en_months): datestr = datestr.replace(months[idx], en_month) for idx, en_month in enumerate(en_monthsShort): datestr = datestr.replace(monthsShort[idx], en_month) temp = datetime.strptime(datestr, "%B %d") if extractedDate.tzinfo: temp = temp.replace(tzinfo=extractedDate.tzinfo) if not hasYear: temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: 
extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if timeStr != "": temp = datetime(timeStr) extractedDate = extractedDate.replace(hour=temp.strftime("%H"), minute=temp.strftime("%M"), second=temp.strftime("%S")) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if (hrAbs or 0) != -1 and (minAbs or 0) != -1: if hrAbs is None and minAbs is None and default_time: hrAbs = default_time.hour minAbs = default_time.minute extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs or 0) if (hrAbs or minAbs) and datestr == "": if not daySpecified and dateNow > extractedDate: extractedDate = extractedDate + relativedelta(days=1) if hrOffset != 0: extractedDate = extractedDate + relativedelta(hours=hrOffset) if minOffset != 0: extractedDate = extractedDate + relativedelta(minutes=minOffset) if secOffset != 0: extractedDate = extractedDate + relativedelta(seconds=secOffset) resultStr = " ".join(words) resultStr = ' '.join(resultStr.split()) resultStr = _pt_pruning(resultStr) return [extractedDate, resultStr] def _pt_pruning(text, symbols=True, accents=True, agressive=True): # agressive pt word pruning words = ["a", "o", "os", "as", "de", "dos", "das", "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", "este", "esta", "deste", "desta", "neste", "nesta", "nesse", "nessa", "foi", "que"] if symbols: symbols = [".", ",", ";", ":", "!", "?", "º", "ª"] for symbol in symbols: text = text.replace(symbol, "") text = text.replace("-", " ").replace("_", " ") if accents: accents = {"a": ["á", "à", "ã", "â"], "e": ["ê", "è", "é"], "i": ["í", 
"ì"], "o": ["ò", "ó"], "u": ["ú", "ù"], "c": ["ç"]} for char in accents: for acc in accents[char]: text = text.replace(acc, char) if agressive: text_words = text.split(" ") for idx, word in enumerate(text_words): if word in words: text_words[idx] = "" text = " ".join(text_words) text = ' '.join(text.split()) return text def get_gender_pt(word, context=""): """ Guess the gender of a word Some languages assign genders to specific words. This method will attempt to determine the gender, optionally using the provided context sentence. Args: word (str): The word to look up context (str, optional): String containing word, for context Returns: str: The code "m" (male), "f" (female) or "n" (neutral) for the gender, or None if unknown/or unused in the given language. """ # parse gender taking context into account word = word.lower() words = context.lower().split(" ") for idx, w in enumerate(words): if w == word and idx != 0: # in portuguese usually the previous word (a determinant) # assigns gender to the next word previous = words[idx - 1].lower() if previous in _MALE_DETERMINANTS_PT: return "m" elif previous in _FEMALE_DETERMINANTS_PT: return "f" # get gender using only the individual word # see if this word has the gender defined if word in _GENDERS_PT: return _GENDERS_PT[word] singular = word.rstrip("s") if singular in _GENDERS_PT: return _GENDERS_PT[singular] # in portuguese the last vowel usually defines the gender of a word # the gender of the determinant takes precedence over this rule for end_str in _FEMALE_ENDINGS_PT: if word.endswith(end_str): return "f" for end_str in _MALE_ENDINGS_PT: if word.endswith(end_str): return "m" return None lingua-franca-release-v0.4.3/lingua_franca/lang/parse_ru.py000066400000000000000000002012631426211343400237410ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer from lingua_franca.lang.common_data_ru import _NUM_STRING_RU, \ _LONG_ORDINAL_RU, _LONG_SCALE_RU, _SHORT_SCALE_RU, _SHORT_ORDINAL_RU, \ _FRACTION_STRING_RU, _MONTHS_CONVERSION, _MONTHS_RU, _TIME_UNITS_CONVERSION, \ _ORDINAL_BASE_RU import re import json from lingua_franca import resolve_resource_file from lingua_franca.time import now_local def generate_plurals_ru(originals): """ Return a new set or dict containing the plural form of the original values, In English this means all with 's' appended to them. 
Args: originals set(str) or dict(str, any): values to pluralize Returns: set(str) or dict(str, any) """ suffixes = ["а", "ах", "ам", "ами", "ные", "ный", "ов", "ом", "ы"] if isinstance(originals, dict): return {key + suffix: value for key, value in originals.items() for suffix in suffixes} return {value + suffix for value in originals for suffix in suffixes} # negate next number (-2 = 0 - 2) _NEGATIVES = {"минус"} # sum the next number (twenty two = 20 + 2) _SUMS = {'двадцать', '20', 'тридцать', '30', 'сорок', '40', 'пятьдесят', '50', 'шестьдесят', '60', 'семьдесят', '70', 'восемьдесят', '80', 'девяносто', '90', 'сто', '100', 'двести', '200', 'триста', '300', 'четыреста', '400', 'пятьсот', '500', 'шестьсот', '600', 'семьсот', '700', 'восемьсот', '800', 'девятьсот', '900'} _MULTIPLIES_LONG_SCALE_RU = set(_LONG_SCALE_RU.values()) | \ generate_plurals_ru(_LONG_SCALE_RU.values()) _MULTIPLIES_SHORT_SCALE_RU = set(_SHORT_SCALE_RU.values()) | \ generate_plurals_ru(_SHORT_SCALE_RU.values()) # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) _FRACTION_MARKER = {"и", "с", " "} # decimal marker ( 1 point 5 = 1 + 0.5) _DECIMAL_MARKER = {"целая", "целых", "точка", "запятая"} _STRING_NUM_RU = invert_dict(_NUM_STRING_RU) _STRING_NUM_RU.update({ "тысяч": 1e3, }) _STRING_NUM_RU.update(generate_plurals_ru(_STRING_NUM_RU)) _STRING_NUM_RU.update({ "четверти": 0.25, "четвёртая": 0.25, "четвёртых": 0.25, "третья": 1 / 3, "третяя": 1 / 3, "вторая": 0.5, "вторых": 0.5, "половина": 0.5, "половиной": 0.5, "пол": 0.5, "одна": 1, "двойка": 2, "двое": 2, "пара": 2, "сот": 100, "сотен": 100, "сотни": 100, "сотня": 100, }) _WORDS_NEXT_RU = [ "будущая", "будущее", "будущей", "будущий", "будущим", "будущую", "новая", "новое", "новой", "новый", "новым", "следующая", "следующее", "следующей", "следующем", "следующий", "следующую", ] _WORDS_PREV_RU = [ "предыдущая", "предыдущем", "предыдущей", "предыдущий", "предыдущим", "предыдущую", "прошедшая", "прошедшем", "прошедшей", 
"прошедший", "прошедшим", "прошедшую", "прошлая", "прошлой", "прошлом", "прошлую", "прошлый", "прошлым", "том", "тот", ] _WORDS_CURRENT_RU = [ "данная", "данное", "данном", "данный", "настойщая", "настоящее", "настойщем", "настойщем", "настойщий", "нынешняя", "нынешнее", "нынешней", "нынешнем", "нынешний", "текущая", "текущее", "текущей", "текущем", "текущий", "это", "этим", "этой", "этом", "этот", "эту", ] _WORDS_NOW_RU = [ "теперь", "сейчас", ] _WORDS_MORNING_RU = ["утро", "утром"] _WORDS_DAY_RU = ["днём"] _WORDS_EVENING_RU = ["вечер", "вечером"] _WORDS_NIGHT_RU = ["ночь", "ночью"] _STRING_SHORT_ORDINAL_RU = invert_dict(_SHORT_ORDINAL_RU) _STRING_LONG_ORDINAL_RU = invert_dict(_LONG_ORDINAL_RU) def _convert_words_to_numbers_ru(text, short_scale=True, ordinals=False): """ Convert words in a string into their equivalent numbers. Args: text str: short_scale boolean: True if short scale numbers should be used. ordinals boolean: True if ordinals (e.g. first, second, third) should be parsed to their number values (1, 2, 3...) Returns: str The original text, with numbers subbed in where appropriate. """ text = text.lower() tokens = tokenize(text) numbers_to_replace = \ _extract_numbers_with_text_ru(tokens, short_scale, ordinals) numbers_to_replace.sort(key=lambda number: number.start_index) results = [] for token in tokens: if not numbers_to_replace or \ token.index < numbers_to_replace[0].start_index: results.append(token.word) else: if numbers_to_replace and \ token.index == numbers_to_replace[0].start_index: results.append(str(numbers_to_replace[0].value)) if numbers_to_replace and \ token.index == numbers_to_replace[0].end_index: numbers_to_replace.pop(0) return ' '.join(results) def _extract_numbers_with_text_ru(tokens, short_scale=True, ordinals=False, fractional_numbers=True): """ Extract all numbers from a list of Tokens, with the words that represent them. Args: [Token]: The tokens to parse. 
short_scale bool: True if short scale numbers should be used, False for long scale. True by default. ordinals bool: True if ordinal words (first, second, third, etc) should be parsed. fractional_numbers bool: True if we should look for fractions and decimals. Returns: [ReplaceableNumber]: A list of tuples, each containing a number and a string. """ placeholder = "" # inserted to maintain correct indices results = [] while True: to_replace = \ _extract_number_with_text_ru(tokens, short_scale, ordinals, fractional_numbers) if not to_replace: break results.append(to_replace) tokens = [ t if not to_replace.start_index <= t.index <= to_replace.end_index else Token(placeholder, t.index) for t in tokens ] results.sort(key=lambda n: n.start_index) return results def _extract_number_with_text_ru(tokens, short_scale=True, ordinals=False, fractional_numbers=True): """ This function extracts a number from a list of Tokens. Args: tokens str: the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 fractional_numbers (bool): True if we should look for fractions and decimals. Returns: ReplaceableNumber """ number, tokens = \ _extract_number_with_text_ru_helper(tokens, short_scale, ordinals, fractional_numbers) return ReplaceableNumber(number, tokens) def _extract_number_with_text_ru_helper(tokens, short_scale=True, ordinals=False, fractional_numbers=True): """ Helper for _extract_number_with_text_en. This contains the real logic for parsing, but produces a result that needs a little cleaning (specific, it may contain leading articles that can be trimmed off). 
Args: tokens [Token]: short_scale boolean: ordinals boolean: fractional_numbers boolean: Returns: int or float, [Tokens] """ if fractional_numbers: fraction, fraction_text = \ _extract_fraction_with_text_ru(tokens, short_scale, ordinals) if fraction: return fraction, fraction_text decimal, decimal_text = \ _extract_decimal_with_text_ru(tokens, short_scale, ordinals) if decimal: return decimal, decimal_text return _extract_whole_number_with_text_ru(tokens, short_scale, ordinals) def _extract_fraction_with_text_ru(tokens, short_scale, ordinals): """ Extract fraction numbers from a string. This function handles text such as '2 and 3/4'. Note that "one half" or similar will be parsed by the whole number function. Args: tokens [Token]: words and their indexes in the original string. short_scale boolean: ordinals boolean: Returns: (int or float, [Token]) The value found, and the list of relevant tokens. (None, None) if no fraction value is found. """ for c in _FRACTION_MARKER: partitions = partition_list(tokens, lambda t: t.word == c) if len(partitions) == 3: numbers1 = \ _extract_numbers_with_text_ru(partitions[0], short_scale, ordinals, fractional_numbers=False) numbers2 = \ _extract_numbers_with_text_ru(partitions[2], short_scale, ordinals, fractional_numbers=True) if not numbers1 or not numbers2: return None, None # ensure first is not a fraction and second is a fraction num1 = numbers1[-1] num2 = numbers2[0] if num1.value >= 1 and 0 < num2.value < 1: return num1.value + num2.value, \ num1.tokens + partitions[1] + num2.tokens return None, None def _extract_decimal_with_text_ru(tokens, short_scale, ordinals): """ Extract decimal numbers from a string. This function handles text such as '2 point 5'. Notes: While this is a helper for extract_number_xx, it also depends on extract_number_xx, to parse out the components of the decimal. This does not currently handle things like: number dot number number number Args: tokens [Token]: The text to parse. 
short_scale boolean: ordinals boolean: Returns: (float, [Token]) The value found and relevant tokens. (None, None) if no decimal value is found. """ for c in _DECIMAL_MARKER: partitions = partition_list(tokens, lambda t: t.word == c) if len(partitions) == 3: numbers1 = \ _extract_numbers_with_text_ru(partitions[0], short_scale, ordinals, fractional_numbers=False) numbers2 = \ _extract_numbers_with_text_ru(partitions[2], short_scale, ordinals, fractional_numbers=False) if not numbers1 or not numbers2: return None, None number = numbers1[-1] decimal = numbers2[0] # TODO handle number dot number number number if "." not in str(decimal.text): return number.value + float('0.' + str(decimal.value)), \ number.tokens + partitions[1] + decimal.tokens return None, None def _extract_whole_number_with_text_ru(tokens, short_scale, ordinals): """ Handle numbers not handled by the decimal or fraction functions. This is generally whole numbers. Note that phrases such as "one half" will be handled by this function, while "one and a half" are handled by the fraction function. Args: tokens [Token]: short_scale boolean: ordinals boolean: Returns: int or float, [Tokens] The value parsed, and tokens that it corresponds to. """ multiplies, string_num_ordinal, string_num_scale = \ _initialize_number_data(short_scale) number_words = [] # type: [Token] val = False prev_val = None next_val = None to_sum = [] for idx, token in enumerate(tokens): current_val = None if next_val: next_val = None continue word = token.word if word in word in _NEGATIVES: number_words.append(token) continue prev_word = tokens[idx - 1].word if idx > 0 else "" next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" # In Russian (?) we do no use suffix (1st,2nd,..) but use point instead (1.,2.,..) if is_numeric(word[:-1]) and \ (word.endswith(".")): # explicit ordinals, 1st, 2nd, 3rd, 4th.... 
Nth word = word[:-1] # handle nth one # if next_word == "one": # would return 1 instead otherwise # tokens[idx + 1] = Token("", idx) # next_word = "" # Normalize Russian inflection of numbers (один, одна, одно,...) if not ordinals: word = _text_ru_inflection_normalize(word, 1) if word not in string_num_scale and \ word not in _STRING_NUM_RU and \ word not in _SUMS and \ word not in multiplies and \ not (ordinals and word in string_num_ordinal) and \ not is_numeric(word) and \ not is_fractional_ru(word, short_scale=short_scale) and \ not look_for_fractions(word.split('/')): words_only = [token.word for token in number_words] if number_words and not all([w in _NEGATIVES for w in words_only]): break else: number_words = [] continue elif word not in multiplies \ and prev_word not in multiplies \ and prev_word not in _SUMS \ and not (ordinals and prev_word in string_num_ordinal) \ and prev_word not in _NEGATIVES: number_words = [token] elif prev_word in _SUMS and word in _SUMS: number_words = [token] else: number_words.append(token) # is this word already a number ? if is_numeric(word): if word.isdigit(): # doesn't work with decimals val = int(word) else: val = float(word) current_val = val # is this word the name of a number ? if word in _STRING_NUM_RU: val = _STRING_NUM_RU.get(word) current_val = val elif word in string_num_scale: val = string_num_scale.get(word) current_val = val elif ordinals and word in string_num_ordinal: val = string_num_ordinal[word] current_val = val # is the prev word an ordinal number and current word is one? # second one, third one if ordinals and prev_word in string_num_ordinal and val == 1: val = prev_val # is the prev word a number and should we sum it? # twenty two, fifty six if (prev_word in _SUMS and val and val < 10) \ or (prev_word in _SUMS and val and val < 100 and prev_val >= 100) \ or all([prev_word in multiplies, val < prev_val if prev_val else False]): val = prev_val + val # is the prev word a number and should we multiply it? 
# twenty hundred, six hundred if word in multiplies: if not prev_val: prev_val = 1 val = prev_val * val # is this a spoken fraction? # half cup if val is False: val = is_fractional_ru(word, short_scale=short_scale) current_val = val # 2 fifths if not ordinals: next_val = is_fractional_ru(next_word, short_scale=short_scale) if next_val: if not val: val = 1 val = val * next_val number_words.append(tokens[idx + 1]) # is this a negative number? if val and prev_word and prev_word in _NEGATIVES: val = 0 - val # let's make sure it isn't a fraction if not val: # look for fractions like "2/3" a_pieces = word.split('/') if look_for_fractions(a_pieces): val = float(a_pieces[0]) / float(a_pieces[1]) else: if all([ prev_word in _SUMS, word not in _SUMS, word not in multiplies, current_val >= 10 ]): # Backtrack - we've got numbers we can't sum. number_words.pop() val = prev_val break prev_val = val if word in multiplies and next_word not in multiplies: # handle long numbers # six hundred sixty six # two million five hundred thousand # # This logic is somewhat complex, and warrants # extensive documentation for the next coder's sake. # # The current word is a power of ten. `current_val` is # its integer value. `val` is our working sum # (above, when `current_val` is 1 million, `val` is # 2 million.) # # We have a dict `string_num_scale` containing [value, word] # pairs for "all" powers of ten: string_num_scale[10] == "ten. # # We need go over the rest of the tokens, looking for other # powers of ten. If we find one, we compare it with the current # value, to see if it's smaller than the current power of ten. # # Numbers which are not powers of ten will be passed over. # # If all the remaining powers of ten are smaller than our # current value, we can set the current value aside for later, # and begin extracting another portion of our final result. # For example, suppose we have the following string. # The current word is "million".`val` is 9000000. # `current_val` is 1000000. 
# # "nine **million** nine *hundred* seven **thousand** # six *hundred* fifty seven" # # Iterating over the rest of the string, the current # value is larger than all remaining powers of ten. # # The if statement passes, and nine million (9000000) # is appended to `to_sum`. # # The main variables are reset, and the main loop begins # assembling another number, which will also be appended # under the same conditions. # # By the end of the main loop, to_sum will be a list of each # "place" from 100 up: [9000000, 907000, 600] # # The final three digits will be added to the sum of that list # at the end of the main loop, to produce the extracted number: # # sum([9000000, 907000, 600]) + 57 # == 9,000,000 + 907,000 + 600 + 57 # == 9,907,657 # # >>> foo = "nine million nine hundred seven thousand six # hundred fifty seven" # >>> extract_number(foo) # 9907657 time_to_sum = True for other_token in tokens[idx + 1:]: if other_token.word in multiplies: if string_num_scale[other_token.word] >= current_val: time_to_sum = False else: continue if not time_to_sum: break if time_to_sum: to_sum.append(val) val = 0 prev_val = 0 if val is not None and to_sum: val += sum(to_sum) return val, number_words def _initialize_number_data(short_scale): """ Generate dictionaries of words to numbers, based on scale. This is a helper function for _extract_whole_number. 
def extract_duration_ru(text):
    """
    Convert a Russian phrase into a duration.

    Convert things like:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"
    into a timedelta. The words used in the duration will be consumed, and
    the remainder returned.

    As an example, "set a timer for 5 minutes" would return
    (timedelta(minutes=5), "set a timer for").

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            will have whitespace stripped from the ends.
        None:
            If `text` is empty or None (kept for backward compatibility).
    """
    if not text:
        return None

    # Russian inflection for time: минута, минуты, минут - safe to use
    # минута as pattern
    # For day: день, дня, дней - short pattern not applicable, list all

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # "<number><space or dash><unit stem><optional inflection suffix>".
    # The number is captured in the named group "value"; a bare "(?P" with
    # no group name is rejected by the re module.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}(?:а|ов|у|ут|уту)?"
    text = _convert_words_to_numbers_ru(text)

    for (unit_ru, unit_en) in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_ru)

        def repl(match):
            # Accumulate the amount for this unit and drop the matched
            # words from the text.  Only called inside this iteration, so
            # capturing the loop variable `unit_en` is safe.
            time_units[unit_en] += float(match.group("value"))
            return ''

        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return duration, text
# Normalize Russian inflection s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace("сегодня вечером", "вечером") \ .replace("сегодня ночью", "ночью") word_list = s.split() for idx, word in enumerate(word_list): # word = word.replace("'s", "") ########## # Russian Day Ordinals - we do not use 1st,2nd format # instead we use full ordinal number names with specific format(suffix) # Example: тридцать первого > 31 count_ordinals = 0 if word == "первого": count_ordinals = 1 # These two have different format elif word == "третьего": count_ordinals = 3 elif word.endswith("ого"): tmp = word[:-3] tmp += "ый" for nr, name in _ORDINAL_BASE_RU.items(): if name == tmp: count_ordinals = nr # If number is bigger than 19 check if next word is also ordinal # and count them together if count_ordinals > 19: if word_list[idx + 1] == "первого": count_ordinals += 1 # These two have different format elif word_list[idx + 1] == "третьего": count_ordinals += 3 elif word_list[idx + 1].endswith("ого"): tmp = word_list[idx + 1][:-3] tmp += "ый" for nr, name in _ORDINAL_BASE_RU.items(): if name == tmp and nr < 10: # write only if sum makes acceptable count of days in month if (count_ordinals + nr) <= 31: count_ordinals += nr if count_ordinals > 0: word = str(count_ordinals) # Write normalized value into word if count_ordinals > 20: # If counted number is greater than 20, clear next word so it is not used again word_list[idx + 1] = "" ########## # Remove inflection from Russian months word_list[idx] = word return word_list def date_found(): return found or \ ( date_string != "" or year_offset != 0 or month_offset != 0 or day_offset is True or hr_offset != 0 or hr_abs or min_offset != 0 or min_abs or sec_offset != 0 ) if text == "": return None anchor_date = anchor_date or now_local() found = False day_specified = False day_offset = False month_offset = 0 year_offset = 0 today = anchor_date.strftime("%w") current_year = anchor_date.strftime("%Y") from_flag = False 
date_string = "" has_year = False time_qualifier = "" time_qualifiers_am = _WORDS_MORNING_RU time_qualifiers_pm = ['дня', 'вечера'] time_qualifiers_pm.extend(_WORDS_DAY_RU) time_qualifiers_pm.extend(_WORDS_EVENING_RU) time_qualifiers_pm.extend(_WORDS_NIGHT_RU) time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) markers = ['на', 'в', 'во', 'до', 'на', 'это', 'около', 'этот', 'через', 'спустя', 'за', 'тот'] days = ['понедельник', 'вторник', 'среда', 'четверг', 'пятница', 'суббота', 'воскресенье'] months = _MONTHS_RU recur_markers = days + ['выходные', 'викенд'] months_short = ['янв', 'фев', 'мар', 'апр', 'май', 'июн', 'июл', 'авг', 'сен', 'окт', 'ноя', 'дек'] year_multiples = ["десятилетие", "век", "тысячелетие"] words = clean_string(text) preposition = "" for idx, word in enumerate(words): if word == "": continue if word in markers: preposition = word word = _text_ru_inflection_normalize(word, 2) word_prev_prev = _text_ru_inflection_normalize( words[idx - 2], 2) if idx > 1 else "" word_prev = _text_ru_inflection_normalize( words[idx - 1], 2) if idx > 0 else "" word_next = _text_ru_inflection_normalize( words[idx + 1], 2) if idx + 1 < len(words) else "" word_next_next = _text_ru_inflection_normalize( words[idx + 2], 2) if idx + 2 < len(words) else "" # this isn't in clean string because I don't want to save back to words start = idx used = 0 if word in _WORDS_NOW_RU and not date_string: result_str = " ".join(words[idx + 1:]) result_str = ' '.join(result_str.split()) extracted_date = anchor_date.replace(microsecond=0) return [extracted_date, result_str] elif word_next in year_multiples: multiplier = None if is_numeric(word): multiplier = extract_number_ru(word) multiplier = multiplier or 1 multiplier = int(multiplier) used += 2 if word_next == "десятилетие": year_offset = multiplier * 10 elif word_next == "век": year_offset = multiplier * 100 elif word_next == "тысячелетие": year_offset = multiplier * 1000 elif word in time_qualifiers_list and 
preposition != "через" and word_next != "назад": time_qualifier = word # parse today, tomorrow, day after tomorrow elif word == "сегодня" and not from_flag: day_offset = 0 used += 1 elif word == "завтра" and not from_flag: day_offset = 1 used += 1 elif word == "послезавтра" and not from_flag: day_offset = 2 used += 1 elif word == "после" and word_next == "завтра" and not from_flag: day_offset = 2 used += 2 elif word == "позавчера" and not from_flag: day_offset = -2 used += 1 elif word == "вчера" and not from_flag: day_offset = -1 used += 1 elif (word in ["день", "дня"] and word_next == "после" and word_next_next == "завтра" and not from_flag and (not word_prev or not word_prev[0].isdigit())): day_offset = 2 used = 2 elif word in ["день", "дня"] and is_numeric(word_prev) and preposition == "через": if word_prev and word_prev[0].isdigit(): day_offset += int(word_prev) start -= 1 used = 2 elif word in ["день", "дня"] and is_numeric(word_prev) and word_next == "назад": if word_prev and word_prev[0].isdigit(): day_offset += -int(word_prev) start -= 1 used = 3 elif word == "сегодня" and not from_flag and word_prev: if word_prev[0].isdigit(): day_offset += int(word_prev) * 7 start -= 1 used = 2 elif word_prev in _WORDS_NEXT_RU: day_offset = 7 start -= 1 used = 2 elif word_prev in _WORDS_PREV_RU: day_offset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "неделя" and not from_flag and preposition in ["через", "на"]: if word_prev[0].isdigit(): day_offset = int(word_prev) * 7 start -= 1 used = 2 elif word_prev in _WORDS_NEXT_RU: day_offset = 7 start -= 1 used = 2 elif word_prev in _WORDS_PREV_RU: day_offset = -7 start -= 1 used = 2 elif word == "месяц" and not from_flag and preposition in ["через", "на"]: if word_prev[0].isdigit(): month_offset = int(word_prev) start -= 1 used = 2 elif word_prev in _WORDS_NEXT_RU: month_offset = 1 start -= 1 used = 2 elif word_prev in _WORDS_PREV_RU: month_offset = -1 start -= 1 used = 2 # parse 5 years, next 
year, last year elif word == "год" and not from_flag and preposition in ["через", "на"]: if word_prev[0].isdigit(): year_offset = int(word_prev) start -= 1 used = 2 elif word_prev in _WORDS_NEXT_RU: year_offset = 1 start -= 1 used = 2 elif word_prev in _WORDS_PREV_RU: year_offset = -1 start -= 1 used = 2 elif word_prev == "через": year_offset = 1 used = 1 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. elif word in days and not from_flag: d = days.index(word) day_offset = (d + 1) - int(today) used = 1 if day_offset < 0: day_offset += 7 if word_prev in _WORDS_NEXT_RU: if day_offset <= 2: day_offset += 7 used += 1 start -= 1 elif word_prev in _WORDS_PREV_RU: day_offset -= 7 used += 1 start -= 1 elif word in months or word in months_short and not from_flag: try: m = months.index(word) except ValueError: m = months_short.index(word) used += 1 # Convert Russian months to english date_string = _MONTHS_CONVERSION.get(m) if word_prev and (word_prev[0].isdigit() or (word_prev == " " and word_prev_prev[0].isdigit())): if word_prev == " " and word_prev_prev[0].isdigit(): date_string += " " + words[idx - 2] used += 1 start -= 1 else: date_string += " " + word_prev start -= 1 used += 1 if word_next and word_next[0].isdigit(): date_string += " " + word_next used += 1 has_year = True else: has_year = False elif word_next and word_next[0].isdigit(): date_string += " " + word_next used += 1 if word_next_next and word_next_next[0].isdigit(): date_string += " " + word_next_next used += 1 has_year = True else: has_year = False # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July valid_followups = days + months + months_short valid_followups.append("сегодня") valid_followups.append("завтра") valid_followups.append("послезавтра") valid_followups.append("вчера") valid_followups.append("позавчера") for followup in _WORDS_NEXT_RU: valid_followups.append(followup) for followup in _WORDS_PREV_RU: valid_followups.append(followup) for followup in 
_WORDS_CURRENT_RU: valid_followups.append(followup) for followup in _WORDS_NOW_RU: valid_followups.append(followup) if (word in ["до", "по", "от", "с", "со"]) and word_next in valid_followups: used = 2 from_flag = True if word_next == "завтра": day_offset += 1 elif word_next == "послезавтра": day_offset += 2 elif word_next == "вчера": day_offset -= 1 elif word_next == "позавчера": day_offset -= 2 elif word_next in days: d = days.index(word_next) tmp_offset = (d + 1) - int(today) used = 2 if tmp_offset < 0: tmp_offset += 7 day_offset += tmp_offset elif word_next_next and word_next_next in days: d = days.index(word_next_next) tmp_offset = (d + 1) - int(today) used = 3 if word_next in _WORDS_NEXT_RU: if day_offset <= 2: tmp_offset += 7 used += 1 start -= 1 elif word_next in _WORDS_PREV_RU: tmp_offset -= 7 used += 1 start -= 1 day_offset += tmp_offset if used > 0: if start - 1 > 0 and (words[start - 1] in _WORDS_CURRENT_RU): start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True day_specified = True # parse time hr_offset = 0 min_offset = 0 sec_offset = 0 hr_abs = None min_abs = None military = False preposition = "" for idx, word in enumerate(words): if word == "": continue if word in markers: preposition = word word = _text_ru_inflection_normalize(word, 2) word_prev_prev = _text_ru_inflection_normalize( words[idx - 2], 2) if idx > 1 else "" word_prev = _text_ru_inflection_normalize( words[idx - 1], 2) if idx > 0 else "" word_next = _text_ru_inflection_normalize( words[idx + 1], 2) if idx + 1 < len(words) else "" word_next_next = _text_ru_inflection_normalize( words[idx + 2], 2) if idx + 2 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "полдень": hr_abs = 12 used += 1 elif word == "полночь": hr_abs = 0 used += 1 elif word in _WORDS_MORNING_RU: if hr_abs is None: hr_abs = 8 used += 1 elif word in _WORDS_DAY_RU: if hr_abs is 
None: hr_abs = 15 used += 1 elif word in _WORDS_EVENING_RU: if hr_abs is None: hr_abs = 19 used += 1 if word_next != "" and word_next[0].isdigit() and ":" in word_next: used -= 1 elif word in _WORDS_NIGHT_RU: if hr_abs is None: hr_abs = 22 # parse half an hour, quarter hour elif word == "час" and \ (word_prev in markers or word_prev_prev in markers): if word_prev in ["пол", "половина"]: min_offset = 30 elif word_prev == "четверть": min_offset = 15 elif word_prev == "через": hr_offset = 1 else: hr_offset = 1 if word_prev_prev in markers: words[idx - 2] = "" if word_prev_prev in _WORDS_CURRENT_RU: day_specified = True words[idx - 1] = "" used += 1 hr_abs = -1 min_abs = -1 # parse 5:00 am, 12:00 p.m., etc # parse in a minute elif word == "минута" and word_prev == "через": min_offset = 1 words[idx - 1] = "" used += 1 # parse in a second elif word == "секунда" and word_prev == "через": sec_offset = 1 words[idx - 1] = "" used += 1 elif word[0].isdigit(): is_time = True str_hh = "" str_mm = "" remainder = "" word_next_next_next = words[idx + 3] \ if idx + 3 < len(words) else "" if word_next in _WORDS_EVENING_RU or word_next in _WORDS_NIGHT_RU or word_next_next in _WORDS_EVENING_RU \ or word_next_next in _WORDS_NIGHT_RU or word_prev in _WORDS_EVENING_RU \ or word_prev in _WORDS_NIGHT_RU or word_prev_prev in _WORDS_EVENING_RU \ or word_prev_prev in _WORDS_NIGHT_RU or word_next_next_next in _WORDS_EVENING_RU \ or word_next_next_next in _WORDS_NIGHT_RU: remainder = "pm" used += 1 if word_prev in _WORDS_EVENING_RU or word_prev in _WORDS_NIGHT_RU: words[idx - 1] = "" if word_prev_prev in _WORDS_EVENING_RU or word_prev_prev in _WORDS_NIGHT_RU: words[idx - 2] = "" if word_next_next in _WORDS_EVENING_RU or word_next_next in _WORDS_NIGHT_RU: used += 1 if word_next_next_next in _WORDS_EVENING_RU or word_next_next_next in _WORDS_NIGHT_RU: used += 1 if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if 
word[i].isdigit(): str_hh += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): str_mm += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": next_word = word_next.replace(".", "") if next_word in ["am", "pm", "ночи", "утра", "дня", "вечера"]: remainder = next_word used += 1 elif next_word == "часа" and word_next_next in ["am", "pm", "ночи", "утра", "дня", "вечера"]: remainder = word_next_next used += 2 elif word_next in _WORDS_MORNING_RU: remainder = "am" used += 2 elif word_next in _WORDS_DAY_RU: remainder = "pm" used += 2 elif word_next in _WORDS_EVENING_RU: remainder = "pm" used += 2 elif word_next == "этого" and word_next_next in _WORDS_MORNING_RU: remainder = "am" used = 2 day_specified = True elif word_next == "на" and word_next_next in _WORDS_DAY_RU: remainder = "pm" used = 2 day_specified = True elif word_next == "на" and word_next_next in _WORDS_EVENING_RU: remainder = "pm" used = 2 day_specified = True elif word_next == "в" and word_next_next in _WORDS_NIGHT_RU: if str_hh and int(str_hh) > 5: remainder = "pm" else: remainder = "am" used += 2 elif hr_abs and hr_abs != -1: if hr_abs >= 12: remainder = "pm" else: remainder = "am" used += 1 else: if time_qualifier != "": military = True if str_hh and int(str_hh) <= 12 and \ (time_qualifier in time_qualifiers_pm): str_hh += str(int(str_hh) + 12) else: # try to parse numbers without colons # 5 hours, 10 minutes etc. length = len(word) str_num = "" remainder = "" for i in range(length): if word[i].isdigit(): str_num += word[i] else: remainder += word[i] if remainder == "": remainder = word_next.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or word_next == "pm" or remainder == "p.m." or word_next == "p.m." 
or (remainder == "дня" and preposition != 'через') or (word_next == "дня" and preposition != 'через') or remainder == "вечера" or word_next == "вечера"): str_hh = str_num remainder = "pm" used = 1 if ( remainder == "pm" or word_next == "pm" or remainder == "p.m." or word_next == "p.m." or (remainder == "дня" and preposition != 'через') or (word_next == "дня" and preposition != 'через') or remainder == "вечера" or word_next == "вечера"): str_hh = str_num remainder = "pm" used = 1 elif ( remainder == "am" or word_next == "am" or remainder == "a.m." or word_next == "a.m." or remainder == "ночи" or word_next == "ночи" or remainder == "утра" or word_next == "утра"): str_hh = str_num remainder = "am" used = 1 elif ( remainder in recur_markers or word_next in recur_markers or word_next_next in recur_markers): # Ex: "7 on mondays" or "3 this friday" # Set str_hh so that is_time == True # when am or pm is not specified str_hh = str_num used = 1 else: if int(str_num) > 100: str_hh = str(int(str_num) // 100) str_mm = str(int(str_num) % 100) military = True if word_next == "час": used += 1 elif ( (word_next == "час" or remainder == "час") and word[0] != '0' and # (wordPrev != "в" and wordPrev != "на") word_prev == "через" and ( int(str_num) < 100 or int(str_num) > 2400 )): # ignores military time # "in 3 hours" hr_offset = int(str_num) used = 2 is_time = False hr_abs = -1 min_abs = -1 elif word_next == "минута" or \ remainder == "минута": # "in 10 minutes" min_offset = int(str_num) used = 2 is_time = False hr_abs = -1 min_abs = -1 elif word_next == "секунда" \ or remainder == "секунда": # in 5 seconds sec_offset = int(str_num) used = 2 is_time = False hr_abs = -1 min_abs = -1 elif int(str_num) > 100: # military time, eg. "3300 hours" str_hh = str(int(str_num) // 100) str_mm = str(int(str_num) % 100) military = True if word_next == "час" or \ remainder == "час": used += 1 elif word_next and word_next[0].isdigit(): # military time, e.g. 
"04 38 hours" str_hh = str_num str_mm = word_next military = True used += 1 if (word_next_next == "час" or remainder == "час"): used += 1 elif ( word_next == "" or word_next == "час" or ( (word_next == "в" or word_next == "на") and ( word_next_next == time_qualifier ) ) or word_next in _WORDS_EVENING_RU or word_next_next in _WORDS_EVENING_RU): str_hh = str_num str_mm = "00" if word_next == "час": used += 1 if (word_next == "в" or word_next == "на" or word_next_next == "в" or word_next_next == "на"): used += (1 if (word_next == "в" or word_next == "на") else 2) word_next_next_next = words[idx + 3] \ if idx + 3 < len(words) else "" if (word_next_next and (word_next_next in time_qualifier or word_next_next_next in time_qualifier)): if (word_next_next in time_qualifiers_pm or word_next_next_next in time_qualifiers_pm): remainder = "pm" used += 1 if (word_next_next in time_qualifiers_am or word_next_next_next in time_qualifiers_am): remainder = "am" used += 1 if time_qualifier != "": if time_qualifier in time_qualifiers_pm: remainder = "pm" used += 1 elif time_qualifier in time_qualifiers_am: remainder = "am" used += 1 else: # TODO: Unsure if this is 100% accurate used += 1 military = True elif remainder == "час": if word_next_next in ["ночи", "утра"]: remainder = "am" used += 1 elif word_next_next in ["дня", "вечера"]: remainder = "pm" used += 1 else: remainder = "" else: is_time = False hh = int(str_hh) if str_hh else 0 mm = int(str_mm) if str_mm else 0 hh = hh + 12 if remainder == "pm" and hh < 12 else hh hh = hh - 12 if remainder == "am" and hh >= 12 else hh if (not military and remainder not in ['am', 'pm', 'час', 'минута', 'секунда'] and ((not day_specified) or 0 <= day_offset < 1)): # ambiguous time, detect whether they mean this evening or # the next morning based on whether it has already passed if anchor_date.hour < hh or (anchor_date.hour == hh and anchor_date.minute < mm): pass # No modification needed elif anchor_date.hour < hh + 12: hh += 12 else: # has 
passed, assume the next morning day_offset += 1 if time_qualifier in time_qualifiers_pm and hh < 12: hh += 12 if hh > 24 or mm > 59: is_time = False used = 0 if is_time: hr_abs = hh min_abs = mm used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): if idx + i >= len(words): break words[idx + i] = "" # if wordPrev == "o" or wordPrev == "oh": # words[words.index(wordPrev)] = "" if word_prev == "скоро": hr_offset = -1 words[idx - 1] = "" idx -= 1 elif word_prev == "позже": hr_offset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and word_prev in markers: words[idx - 1] = "" if word_prev in _WORDS_CURRENT_RU: day_specified = True if idx > 1 and word_prev_prev in markers: words[idx - 2] = "" if word_prev_prev in _WORDS_CURRENT_RU: day_specified = True idx += used - 1 found = True # check that we found a date if not date_found(): return None if day_offset is False: day_offset = 0 # perform date manipulation extracted_date = anchor_date.replace(microsecond=0) if date_string != "": # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" try: temp = datetime.strptime(date_string, "%B %d") except ValueError: # Try again, allowing the year temp = datetime.strptime(date_string, "%B %d %Y") extracted_date = extracted_date.replace(hour=0, minute=0, second=0) if not has_year: temp = temp.replace(year=extracted_date.year, tzinfo=extracted_date.tzinfo) if extracted_date < temp: extracted_date = extracted_date.replace( year=int(current_year), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extracted_date.tzinfo) else: extracted_date = extracted_date.replace( year=int(current_year) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extracted_date.tzinfo) else: extracted_date = extracted_date.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d")), tzinfo=extracted_date.tzinfo) else: # ignore the current HH:MM:SS if relative using days or greater if hr_offset == 0 and min_offset == 0 and sec_offset == 0: extracted_date = extracted_date.replace(hour=0, minute=0, second=0) if year_offset != 0: extracted_date = extracted_date + relativedelta(years=year_offset) if month_offset != 0: extracted_date = extracted_date + relativedelta(months=month_offset) if day_offset != 0: extracted_date = extracted_date + relativedelta(days=day_offset) if hr_abs != -1 and min_abs != -1: # If no time was supplied in the string set the time to default # time if it's available if hr_abs is None and min_abs is None and default_time is not None: hr_abs, min_abs = default_time.hour, default_time.minute else: hr_abs = hr_abs or 0 min_abs = min_abs or 0 extracted_date = extracted_date + relativedelta(hours=hr_abs, minutes=min_abs) if (hr_abs != 0 or min_abs != 0) and date_string == "": if not day_specified and anchor_date > extracted_date: extracted_date = extracted_date + relativedelta(days=1) if hr_offset != 0: extracted_date = extracted_date + relativedelta(hours=hr_offset) if min_offset != 0: extracted_date = 
def is_fractional_ru(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Plural forms ending in "тые"/"тых" (две четвёртые, три пятых) are first
    rewritten to the singular "...тая" form, because that is the form the
    lookup table is keyed on.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
            (accepted for interface compatibility; not used here)
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    input_str = input_str.lower()

    if input_str[-3:] in ["тые", "тых"]:
        # leading number is bigger than one (две четвёртые, три пятых):
        # replace the plural ending with the singular one, keeping the
        # stem ("пятых" -> "пя" + "тая" == "пятая").
        input_str = input_str[:-3] + "тая"

    # "целая" is mapped to a denominator of 1, i.e. it evaluates to 1.0.
    fractions = {"целая": 1}

    # Denominators of 1 or less are skipped; every other entry of
    # _FRACTION_STRING_RU maps its word to its denominator.
    for num in _FRACTION_STRING_RU:
        if num > 1:
            fractions[_FRACTION_STRING_RU[num]] = num

    if input_str in fractions:
        return 1.0 / fractions[input_str]
    return False
third=3 instead of 1/3 Returns: list: list of extracted numbers as floats """ results = _extract_numbers_with_text_ru(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] class RussianNormalizer(Normalizer): with open(resolve_resource_file("text/ru-ru/normalize.json"), encoding='utf8') as f: _default_config = json.load(f) def normalize_ru(text, remove_articles=True): """ Russian string normalization """ return RussianNormalizer().normalize(text, remove_articles) def _text_ru_inflection_normalize(word, arg): """ Russian Inflection normalizer. This try to normalize known inflection. This function is called from multiple places, each one is defined with arg. Args: word [Word] arg [Int] Returns: word [Word] """ if word in ["тысяч", "тысячи"]: return "тысяча" if arg == 1: # _extract_whole_number_with_text_ru if word in ["одна", "одним", "одно", "одной"]: return "один" if word == "две": return "два" if word == "пару": return "пара" elif arg == 2: # extract_datetime_ru if word in ["часа", "часам", "часами", "часов", "часу"]: return "час" if word in ["минут", "минутам", "минутами", "минуту", "минуты"]: return "минута" if word in ["секунд", "секундам", "секундами", "секунду", "секунды"]: return "секунда" if word in ["дней", "дни"]: return "день" if word in ["неделе", "недели", "недель"]: return "неделя" if word in ["месяца", "месяцев"]: return "месяц" if word in ["года", "лет"]: return "год" if word in _WORDS_MORNING_RU: return "утром" if word in ["полудне", "полудня"]: return "полдень" if word in _WORDS_EVENING_RU: return "вечером" if word in _WORDS_NIGHT_RU: return "ночь" if word in ["викенд", "выходным", "выходных"]: return "выходные" if word in ["столетие", "столетий", "столетия"]: return "век" # Week days if word in ["среду", "среды"]: return "среда" if word in ["пятницу", "пятницы"]: return "пятница" if word in ["субботу", "субботы"]: return "суббота" # Months if word in ["марта", "марте"]: return "март" if word in ["мае", "мая"]: 
return "май" if word in ["августа", "августе"]: return "август" if word[-2:] in ["ле", "ля", "не", "ня", "ре", "ря"]: tmp = word[:-1] + "ь" for name in _MONTHS_RU: if name == tmp: return name return word lingua-franca-release-v0.4.3/lingua_franca/lang/parse_sl.py000066400000000000000000000000421426211343400237210ustar00rootroot00000000000000# TODO implement parsing function lingua-franca-release-v0.4.3/lingua_franca/lang/parse_sv.py000066400000000000000000001014141426211343400237400ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta from lingua_franca.time import now_local from .parse_common import (is_numeric, look_for_fractions, Normalizer, tokenize, Token) def _find_numbers_in_text(tokens): """Finds duration related numbers in texts and makes a list of mappings. The mapping will be for number to token that created it, if no number was created from the token the mapping will be from None to the token. The function is optimized to generate data that can be parsed to a duration so it returns the list in reverse order to make the "size" (minutes/hours/ etc.) come first and the related numbers afterwards. 
Args: tokens: Tokens to parse Returns: list of (number, token) tuples """ parts = [] for tok in tokens: res = extract_number_sv(tok.word) if res: parts.insert(0, (res, tok)) # Special case for quarter of an hour if tok.word == 'kvart': parts.insert(0, (None, Token('timmar', index=-1))) elif tok.word in ['halvtimme', 'halvtimma']: parts.insert(0, (30, tok)) parts.insert(0, (None, Token('minuter', index=-1))) else: parts.insert(0, (None, tok)) return parts def _combine_adjacent_numbers(number_map): """Combine adjacent numbers through multiplication. Walks through a number map and joins adjasent numbers to handle cases such as "en halvtimme" (one half hour). Returns: (list): simplified number_map """ simplified = [] skip = False for i in range(len(number_map) - 1): if skip: skip = False continue if number_map[i][0] and number_map[i + 1][0]: combined_number = number_map[i][0] * number_map[i + 1][0] combined_tokens = (number_map[i][1], number_map[i + 1][1]) simplified.append((combined_number, combined_tokens)) skip = True else: simplified.append((number_map[i][0], (number_map[i][1],))) if not skip: simplified.append((number_map[-1][0], (number_map[-1][1],))) return simplified def extract_duration_sv(text): """ Convert an swedish phrase into a number of seconds. The function handles durations from seconds up to days. Convert things like: "10 minute" "2 and a half hours" "3 days 8 hours 10 minutes and 49 seconds" into an int, representing the total number of seconds. The words used in the duration will be consumed, and the remainder returned. As an example, "set a timer for 5 minutes" would return (300, "set a timer for"). Args: text (str): string containing a duration Returns: (timedelta, str): A tuple containing the duration and the remaining text not consumed in the parsing. The first value will be None if no duration is found. The text returned will have whitespace stripped from the ends. 
""" tokens = tokenize(text) number_tok_map = _find_numbers_in_text(tokens) # Combine adjacent numbers simplified = _combine_adjacent_numbers(number_tok_map) states = { 'days': 0, 'hours': 0, 'minutes': 0, 'seconds': 0 } # Parser state, mapping words that should set the parser to collect # numbers to a specific time "size" state_words = { 'days': ('dygn', 'dag', 'dagar', 'dags'), 'hours': ('timmar', 'timme', 'timma', 'timmes', 'timmas'), 'minutes': ('minuter', 'minuters', 'minut', 'minuts'), 'seconds': ('sekunder', 'sekunders', 'sekund', 'sekunds') } binding_words = ('och') consumed = [] state = None valid = False for num, toks in simplified: if state and num: states[state] += num consumed.extend(toks) valid = True # If a state field got set this is valid duration elif num is None: for s in state_words: if toks[0].word in state_words[s]: state = s consumed.extend(toks) break else: if toks[0].word not in binding_words: state = None td = timedelta(**states) remainder = ' '.join([t.word for t in tokens if t not in consumed]) return (td, remainder) if valid else None def extract_number_sv(text, short_scale=True, ordinals=False): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize Returns: (int) or (float): The value of extracted number """ # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API # compatibility reasons. 
text = text.lower() aWords = text.split() and_pass = False valPreAnd = False val = False count = 0 while count < len(aWords): word = aWords[count] if is_numeric(word): val = float(word) elif word == "första": val = 1 elif word == "andra": val = 2 elif word == "tredje": val = 3 elif word == "fjärde": val = 4 elif word == "femte": val = 5 elif word == "sjätte": val = 6 elif is_fractional_sv(word): val = is_fractional_sv(word) else: if word == "en": val = 1 if word == "ett": val = 1 elif word == "två": val = 2 elif word == "tre": val = 3 elif word == "fyra": val = 4 elif word == "fem": val = 5 elif word == "sex": val = 6 elif word == "sju": val = 7 elif word == "åtta": val = 8 elif word == "nio": val = 9 elif word == "tio": val = 10 if val: if count < (len(aWords) - 1): wordNext = aWords[count + 1] else: wordNext = "" valNext = is_fractional_sv(wordNext) if valNext: val = val * valNext aWords[count + 1] = "" if not val: # look for fractions like "2/3" aPieces = word.split('/') if look_for_fractions(aPieces): val = float(aPieces[0]) / float(aPieces[1]) elif and_pass: # added to value, quit here val = valPreAnd break else: count += 1 continue aWords[count] = "" if and_pass: aWords[count - 1] = '' # remove "och" val += valPreAnd elif count + 1 < len(aWords) and aWords[count + 1] == 'och': and_pass = True valPreAnd = val val = False count += 2 continue elif count + 2 < len(aWords) and aWords[count + 2] == 'och': and_pass = True valPreAnd = val val = False count += 3 continue break return val or False def extract_datetime_sv(text, anchorDate=None, default_time=None): def clean_string(s): """ cleans the input string of unneeded punctuation and capitalization among other things. 
""" s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ .replace(' den ', ' ').replace(' en ', ' ') wordList = s.split() for idx, word in enumerate(wordList): word = word.replace("'s", "") ordinals = ["rd", "st", "nd", "th"] if word[0].isdigit(): for ordinal in ordinals: if ordinal in word: word = word.replace(ordinal, "") wordList[idx] = word return wordList def date_found(): return found or \ ( datestr != "" or timeStr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) if text == "": return None anchorDate = anchorDate or now_local() found = False daySpecified = False dayOffset = False monthOffset = 0 yearOffset = 0 dateNow = anchorDate today = dateNow.strftime("%w") currentYear = dateNow.strftime("%Y") fromFlag = False datestr = "" hasYear = False timeQualifier = "" timeQualifiersList = ['morgon', 'förmiddag', 'eftermiddag', 'kväll'] markers = ['på', 'i', 'den här', 'kring', 'efter'] days = ['måndag', 'tisdag', 'onsdag', 'torsdag', 'fredag', 'lördag', 'söndag'] months = ['januari', 'februari', 'mars', 'april', 'maj', 'juni', 'juli', 'augusti', 'september', 'oktober', 'november', 'december'] monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec'] words = clean_string(text) for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # this isn't in clean string because I don't want to save back to words word = word.rstrip('s') start = idx used = 0 # save timequalifier for later if word in timeQualifiersList: timeQualifier = word # parse today, tomorrow, day after tomorrow elif word == "idag" and not fromFlag: dayOffset = 0 used += 1 elif word == "imorgon" and not fromFlag: dayOffset = 1 used += 1 elif word 
== "morgondagen" or word == "morgondagens" and not fromFlag: dayOffset = 1 used += 1 elif word == "övermorgon" and not fromFlag: dayOffset = 2 used += 1 # parse 5 days, 10 weeks, last week, next week elif word == "dag" or word == "dagar": if wordPrev[0].isdigit(): dayOffset += int(wordPrev) start -= 1 used = 2 elif word == "vecka" or word == "veckor" and not fromFlag: if wordPrev[0].isdigit(): dayOffset += int(wordPrev) * 7 start -= 1 used = 2 elif wordPrev == "nästa": dayOffset = 7 start -= 1 used = 2 elif wordPrev == "förra": dayOffset = -7 start -= 1 used = 2 # parse 10 months, next month, last month elif word == "månad" and not fromFlag: if wordPrev[0].isdigit(): monthOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "nästa": monthOffset = 1 start -= 1 used = 2 elif wordPrev == "förra": monthOffset = -1 start -= 1 used = 2 # parse 5 years, next year, last year elif word == "år" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 used = 2 elif wordPrev == "nästa": yearOffset = 1 start -= 1 used = 2 elif wordPrev == "förra": yearOffset = -1 start -= 1 used = 2 # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. 
elif word in days and not fromFlag: d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 if wordPrev == "nästa": dayOffset += 7 used += 1 start -= 1 elif wordPrev == "förra": dayOffset -= 7 used += 1 start -= 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort and not fromFlag: try: m = months.index(word) except ValueError: m = monthsShort.index(word) used += 1 datestr = months[m] if wordPrev and (wordPrev[0].isdigit() or (wordPrev == "of" and wordPrevPrev[0].isdigit())): if wordPrev == "of" and wordPrevPrev[0].isdigit(): datestr += " " + words[idx - 2] used += 1 start -= 1 else: datestr += " " + wordPrev start -= 1 used += 1 if wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 hasYear = True else: hasYear = False elif wordNext and wordNext[0].isdigit(): datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): datestr += " " + wordNextNext used += 1 hasYear = True else: hasYear = False # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort validFollowups.append("idag") validFollowups.append("imorgon") validFollowups.append("nästa") validFollowups.append("förra") validFollowups.append("nu") if (word == "från" or word == "efter") and wordNext in validFollowups: used = 2 fromFlag = True if wordNext == "imorgon": dayOffset += 1 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 if tmpOffset < 0: tmpOffset += 7 dayOffset += tmpOffset elif wordNextNext and wordNextNext in days: d = days.index(wordNextNext) tmpOffset = (d + 1) - int(today) used = 3 if wordNext == "nästa": tmpOffset += 7 used += 1 start -= 1 elif wordNext == "förra": tmpOffset -= 7 used += 1 start -= 1 dayOffset += tmpOffset if used > 0: if start - 1 > 0 and words[start - 1] == "denna": start -= 1 used += 1 for i in range(0, used): words[i + start] = "" if start - 
1 >= 0 and words[start - 1] in markers: words[start - 1] = "" found = True daySpecified = True # parse time timeStr = "" hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None for idx, word in enumerate(words): if word == "": continue wordPrevPrev = words[idx - 2] if idx > 1 else "" wordPrev = words[idx - 1] if idx > 0 else "" wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" # parse noon, midnight, morning, afternoon, evening used = 0 if word == "middag": hrAbs = 12 used += 1 elif word == "midnatt": hrAbs = 0 used += 1 elif word == "morgon": if not hrAbs: hrAbs = 8 used += 1 elif word == "förmiddag": if not hrAbs: hrAbs = 10 used += 1 elif word == "eftermiddag": if not hrAbs: hrAbs = 15 used += 1 elif word == "kväll": if not hrAbs: hrAbs = 19 used += 1 # parse half an hour, quarter hour elif wordPrev in markers or wordPrevPrev in markers: if word == "halvtimme" or word == "halvtimma": minOffset = 30 elif word == "kvart": minOffset = 15 elif word == "timme" or word == "timma": hrOffset = 1 words[idx - 1] = "" used += 1 hrAbs = -1 minAbs = -1 # parse 5:00 am, 12:00 p.m., etc elif word[0].isdigit(): isTime = True strHH = "" strMM = "" remainder = "" if ':' in word: # parse colons # "3:00 in the morning" stage = 0 length = len(word) for i in range(length): if stage == 0: if word[i].isdigit(): strHH += word[i] elif word[i] == ":": stage = 1 else: stage = 2 i -= 1 elif stage == 1: if word[i].isdigit(): strMM += word[i] else: stage = 2 i -= 1 elif stage == 2: remainder = word[i:].replace(".", "") break if remainder == "": nextWord = wordNext.replace(".", "") if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 elif nextWord == "tonight": remainder = "pm" used += 1 elif wordNext == "in" and wordNextNext == "the" and \ words[idx + 3] == "morning": remainder = "am" used += 3 elif wordNext == "in" and wordNextNext == "the" and \ words[idx + 3] == "afternoon": remainder = 
"pm" used += 3 elif wordNext == "in" and wordNextNext == "the" and \ words[idx + 3] == "evening": remainder = "pm" used += 3 elif wordNext == "in" and wordNextNext == "morning": remainder = "am" used += 2 elif wordNext == "in" and wordNextNext == "afternoon": remainder = "pm" used += 2 elif wordNext == "in" and wordNextNext == "evening": remainder = "pm" used += 2 elif wordNext == "this" and wordNextNext == "morning": remainder = "am" used = 2 elif wordNext == "this" and wordNextNext == "afternoon": remainder = "pm" used = 2 elif wordNext == "this" and wordNextNext == "evening": remainder = "pm" used = 2 elif wordNext == "at" and wordNextNext == "night": if strHH > 5: remainder = "pm" else: remainder = "am" used += 2 else: if timeQualifier != "": if strHH <= 12 and \ (timeQualifier == "evening" or timeQualifier == "afternoon"): strHH += 12 else: # try to parse # s without colons # 5 hours, 10 minutes etc. length = len(word) strNum = "" remainder = "" for i in range(length): if word[i].isdigit(): strNum += word[i] else: remainder += word[i] if remainder == "": remainder = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or wordNext == "pm" or remainder == "p.m." or wordNext == "p.m."): strHH = strNum remainder = "pm" used = 1 elif ( remainder == "am" or wordNext == "am" or remainder == "a.m." 
or wordNext == "a.m."): strHH = strNum remainder = "am" used = 1 else: if wordNext == "pm" or wordNext == "p.m.": strHH = strNum remainder = "pm" used = 1 elif wordNext == "am" or wordNext == "a.m.": strHH = strNum remainder = "am" used = 1 elif ( int(word) > 100 and ( wordPrev == "o" or wordPrev == "oh" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 if wordNext == "hours": used += 1 elif ( wordNext == "hours" and word[0] != '0' and ( int(word) < 100 and int(word) > 2400 )): # "in 3 hours" hrOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "minutes": # "in 10 minutes" minOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif wordNext == "seconds": # in 5 seconds secOffset = int(word) used = 2 isTime = False hrAbs = -1 minAbs = -1 elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 if wordNext == "hours": used += 1 elif wordNext[0].isdigit(): strHH = word strMM = wordNext used += 1 if wordNextNext == "hours": used += 1 elif ( wordNext == "" or wordNext == "o'clock" or ( wordNext == "in" and ( wordNextNext == "the" or wordNextNext == timeQualifier ) )): strHH = word strMM = 00 if wordNext == "o'clock": used += 1 if wordNext == "in" or wordNextNext == "in": used += (1 if wordNext == "in" else 2) if (wordNextNext and wordNextNext in timeQualifier or (words[words.index(wordNextNext) + 1] and words[words.index(wordNextNext) + 1] in timeQualifier)): if (wordNextNext == "afternoon" or (len(words) > words.index(wordNextNext) + 1 and words[words.index( wordNextNext) + 1] == "afternoon")): remainder = "pm" if (wordNextNext == "evening" or (len(words) > (words.index(wordNextNext) + 1) and words[words.index( wordNextNext) + 1] == "evening")): remainder = "pm" if (wordNextNext == "morning" or (len(words) > words.index(wordNextNext) + 1 and words[words.index( wordNextNext) + 1] == "morning")): remainder = "am" else: isTime = False strHH = int(strHH) 
if strHH else 0 strMM = int(strMM) if strMM else 0 strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH if strHH > 24 or strMM > 59: isTime = False used = 0 if isTime: hrAbs = strHH * 1 minAbs = strMM * 1 used += 1 if used > 0: # removed parsed words from the sentence for i in range(used): words[idx + i] = "" if wordPrev == "o" or wordPrev == "oh": words[words.index(wordPrev)] = "" if wordPrev == "early": hrOffset = -1 words[idx - 1] = "" idx -= 1 elif wordPrev == "late": hrOffset = 1 words[idx - 1] = "" idx -= 1 if idx > 0 and wordPrev in markers: words[idx - 1] = "" if idx > 1 and wordPrevPrev in markers: words[idx - 2] = "" idx += used - 1 found = True # check that we found a date if not date_found(): return None if dayOffset is False: dayOffset = 0 # perform date manipulation extractedDate = dateNow extractedDate = extractedDate.replace(microsecond=0, second=0, minute=0, hour=0) if datestr != "": temp = datetime.strptime(datestr, "%B %d") if not hasYear: temp = temp.replace(year=extractedDate.year) if extractedDate < temp: extractedDate = extractedDate.replace(year=int(currentYear), month=int( temp.strftime( "%m")), day=int(temp.strftime( "%d"))) else: extractedDate = extractedDate.replace( year=int(currentYear) + 1, month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) else: extractedDate = extractedDate.replace( year=int(temp.strftime("%Y")), month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) if timeStr != "": temp = datetime(timeStr) extractedDate = extractedDate.replace(hour=temp.strftime("%H"), minute=temp.strftime("%M"), second=temp.strftime("%S")) if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) if hrAbs is None and minAbs is None and default_time: hrAbs = 
def is_fractional_sv(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    The comparison is case-insensitive. Common Swedish endings are removed
    before the lookup, one after another in this order: "ars", "ar", "a",
    "s" (so "femtedelar" -> "femtedel", "halva" -> "halv",
    "tredjedels" -> "tredjedel").

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
                            (accepted for API compatibility; not used here)
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    # Lowercase once so the membership test and the index lookup below see
    # the same string; previously a mixed-case word passed the membership
    # test and then raised ValueError in list.index().
    word = input_str.lower()

    # Each ending is checked against the result of the previous strip.
    for ending in ('ars', 'ar', 'a', 's'):
        if word.endswith(ending):
            word = word[:-len(ending)]

    # Position in the list is the denominator minus one ("hel" == 1/1).
    aFrac = ["hel", "halv", "tredjedel", "fjärdedel", "femtedel",
             "sjättedel", "sjundedel", "åttondel", "niondel", "tiondel",
             "elftedel", "tolftedel"]
    if word in aFrac:
        return 1.0 / (aFrac.index(word) + 1)
    if word == "kvart":
        return 1.0 / 4
    if word == "trekvart":
        return 3.0 / 4

    return False
def fuzzy_match(x: str, against: str) -> float:
    """Score how similar two strings are.

    Returns:
        similarity ratio -- 1.0 for a perfect match, down to 0.0 for no
        match at all.
    """
    matcher = SequenceMatcher(None, x, against)
    return matcher.ratio()


def match_one(query, choices):
    """
        Pick the choice most similar to the query.

        Args:
            query (str): string to test
            choices (list): list or dictionary of choices; for a dictionary
                            the keys are compared and the value of the
                            winning key is returned

        Returns: tuple: (best match, score)
    """
    is_mapping = isinstance(choices, dict)
    if is_mapping:
        candidates = list(choices.keys())
    elif isinstance(choices, list):
        candidates = choices
    else:
        raise ValueError('a list or dict of choices must be provided')

    candidates[0]  # an empty collection fails here with IndexError

    # max() keeps the first of several equally good candidates, which is
    # what a strict "greater than" comparison in a running scan does too.
    winner, score = max(
        ((candidate, fuzzy_match(query, candidate))
         for candidate in candidates),
        key=lambda pair: pair[1])

    if is_mapping:
        return (choices[winner], score)
    return (winner, score)
@localized_function()
def extract_duration(text, lang=''):
    """ Convert a phrase describing a duration into a timedelta

    Convert things like:

    * "10 minute"
    * "2 and a half hours"
    * "3 days 8 hours 10 minutes and 49 seconds"

    into a timedelta representing the total duration.

    The words used in the duration will be consumed, and
    the remainder returned.

    As an example, "set a timer for 5 minutes" would return
    ``(timedelta(seconds=300), "set a timer for")``.

    Args:
        text (str): string containing a duration
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    NOTE(review): individual language implementations may
                    deviate from this for the "nothing found" case --
                    confirm per language before unpacking the result.
    """
Vague terminology are given arbitrary values, like: - morning = 8 AM - afternoon = 3 PM - evening = 7 PM If a time isn't supplied or implied, the function defaults to 12 AM Args: text (str): the text to be interpreted anchorDate (:obj:`datetime`, optional): the date to be used for relative dating (for example, what does "tomorrow" mean?). Defaults to the current local date/time. lang (str): the BCP-47 code for the language to use, None uses default default_time (datetime.time): time to use if none was found in the input string. Returns: [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date as a datetime object in the local timezone. 'leftover_string' is the original phrase with all date and time related keywords stripped out. See examples for further clarification Returns 'None' if no date or time related text is found. Examples: >>> extract_datetime( ... "What is the weather like the day after tomorrow?", ... datetime(2017, 6, 30, 00, 00) ... ) [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like'] >>> extract_datetime( ... "Set up an appointment 2 weeks from Sunday at 5 pm", ... datetime(2016, 2, 19, 00, 00) ... ) [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment'] >>> extract_datetime( ... "Set up an appointment", ... datetime(2016, 2, 19, 00, 00) ... ) None """ @localized_function() def normalize(text, lang='', remove_articles=True): """Prepare a string for parsing This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. remove_articles (bool): whether to remove articles (like 'a', or 'the'). True by default. Returns: (str): The normalized string. """ @localized_function() def get_gender(word, context="", lang=''): """ Guess the gender of a word Some languages assign genders to specific words. 
@localized_function()
def is_fractional(input_str, short_scale=True, lang=''):
    """
    This function takes the given text and checks if it is a fraction.
    Used by most of the number extractors.

    Will return False on phrases that *contain* a fraction. Only detects
    exact matches. To pull a fraction from a string, see extract_number()

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
        lang (str, optional): an optional BCP-47 language code, if omitted
                              the default language will be used.
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
Returns: (bool) or (float): False if not an ordinal, otherwise the number corresponding to the ordinal """ lingua-franca-release-v0.4.3/lingua_franca/res/000077500000000000000000000000001426211343400214135ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/000077500000000000000000000000001426211343400223775ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/000077500000000000000000000000001426211343400233675ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/and.word000066400000000000000000000000021426211343400250160ustar00rootroot00000000000000i lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/date_time.json000066400000000000000000000064571426211343400262310ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^2\\d$", "format": "vint-i-{x}"}, "5": {"match": "^[3-9]\\d$", "format": "{x0}-{x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^1\\d{2}$", "format": "{x_in_x00}-cent"}, "2": {"match": "^\\d{3}$", "format": "{x_in_x00}-cents"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^10\\d\\d$", "format": "mil"}, "2": {"match": "^11\\d\\d$", "format": "mil cent"}, "3": {"match": "^1[2-9]\\d\\d$", "format": "mil {x_in_x00}-cents"}, "4": {"match": "^[2-9]0\\d{2}$", "format": "{x_in_x000} mil"}, "5": {"match": "^[2-9]1\\d{2}$", "format": "{x_in_x000} mil cent"}, "6": {"match": "^[2-9][2-9]\\d{2}$", "format": "{x_in_x000} mil {x_in_x00}-cents"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": 
"^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "a.C." }, "date_format": { "date_full": "{weekday}, {day} de {month} de {formatted_year}", "date_full_no_year": "{weekday}, {day} de {month}", "date_full_no_year_month": "{weekday}, dia {day}", "today": "avui", "tomorrow": "demà", "yesterday": "ahir" }, "date_time_format": { "date_time": "{formatted_date} a {formatted_time}" }, "weekday": { "0": "dilluns", "1": "dimarts", "2": "dimecres", "3": "dijous", "4": "divendres", "5": "dissabte", "6": "diumenge" }, "date": { "1": "primer", "2": "dos", "3": "tres", "4": "quatre", "5": "cinc", "6": "sis", "7": "set", "8": "vuit", "9": "nou", "10": "deu", "11": "onze", "12": "dotze", "13": "tretze", "14": "catorze", "15": "quinze", "16": "setze", "17": "disset", "18": "divuit", "19": "dinou", "20": "vint", "21": "vint-i-u", "22": "vint-i-dos", "23": "vint-i-tres", "24": "vint-i-quatre", "25": "vint-i-cinc", "26": "vint-i-sis", "27": "vint-i-set", "28": "vint-i-vuit", "29": "vint-i-nou", "30": "trenta", "31": "trenta-u" }, "month": { "1": "gener", "2": "febrer", "3": "març", "4": "abril", "5": "maig", "6": "juny", "7": "juliol", "8": "agost", "9": "setembre", "10": "octubre", "11": "novembre", "12": "desembre" }, "number": { "0": "zero", "1": "u", "2": "dos", "3": "tres", "4": "quatre", "5": "cinc", "6": "sis", "7": "set", "8": "vuit", "9": "nou", "10": "deu", "11": "onze", "12": "dotze", "13": "tretze", "14": "catorze", "15": "quinze", "16": "setze", "17": "disset", "18": "divuit", "19": "dinou", "20": "vint", "30": "trenta", "40": "quaranta", "50": "cinquanta", "60": "seixanta", "70": "setanta", "80": "vuitanta", "90": "noranta" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/date_time_test.json000066400000000000000000000100201426211343400272450ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "u a.C." 
}, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "deu a.C." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "noranta-dos a.C." }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vuit-cents tres" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vuit-cents onze" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "quatre-cents cinquanta-quatre" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil cinc" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil dotze" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil quaranta-sis" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil vuit-cents set" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil set-cents disset" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil nou-cents vuitanta-vuit"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil nou"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil divuit"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil vint-i-u"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil trenta"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dos mil cent" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dos mil" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tres mil cent vint a.C." 
}, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tres mil dos-cents quaranta-u a.C." }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinc mil dos-cents" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil cent" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dos mil cent" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "dimarts, trenta-u de gener de dos mil disset"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer de dos mil divuit"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "diumenge, dia quatre"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "demà"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "avui"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "ahir"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "diumenge, quatre de febrer de dos mil divuit"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "dimarts, trenta-u de gener de dos mil disset a la una i vint-i-dos de la tarda"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "dimarts, trenta-u de gener de dos mil disset a les tretze i vint-i-dos"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/day.word000066400000000000000000000000041426211343400250330ustar00rootroot00000000000000dia lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/days.word000066400000000000000000000000051426211343400252170ustar00rootroot00000000000000dies lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/hour.word000066400000000000000000000000051426211343400252340ustar00rootroot00000000000000hora lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/hours.word000066400000000000000000000000061426211343400254200ustar00rootroot00000000000000hores lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/minute.word000066400000000000000000000000061426211343400255610ustar00rootroot00000000000000minut lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/minutes.word000066400000000000000000000000071426211343400257450ustar00rootroot00000000000000minuts lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/normalize.json000066400000000000000000000036771426211343400262770ustar00rootroot00000000000000{ "lowercase": false, "numbers_to_digits": true, "expand_contractions": false, "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": true, "contractions": {}, "word_replacements": {}, "number_replacements": { "catorze": "14", "cent": "100", "cents": "100", "cinc": "5", "cinc-centes": "500", "cinc-cents": "500", "cinquanta": "50", "deu": "10", "dinou": "19", "setze": "16", "disset": "17", "dihuit": "18", "divuit": "18", "dos": "2", "dos-centes": "200", "dos-cents": "200", "dotze": "12", "dues": "2", "dues-centes": "200", "huitanta": "80", "huit": "8", "huit-centes": "800", "huit-cents": "800", "mil": "1000", "milió": "1000000", "nou": "9", "nou-centes": "900", "nou-cents": "900", "noranta": "90", "onze": "11", "primer": "1", "primera": "1", "quaranta": "40", "quatre": "4", "quatre-centes": "400", "quatre-cents": "400", "quinze": "15", "segon": "2", "segona": "2", "seixanta": 
"60", "set": "7", "set-centes": "700", "set-cents": "700", "setanta": "70", "sis": "6", "sis-centes": "600", "sis-cents": "600", "tercer": "3", "trenta": "30", "tres": "3", "tres-centes": "300", "tres-cents": "300", "tretze": "13", "u": "1", "un": "1", "una": "1", "vint": "20", "vuitanta": "80", "vuit": "8", "vuit-centes": "800", "vuit-cents": "800", "zero": "0" }, "stopwords": [ "de", "del", "dels", "ell", "ella", "ells", "elles", "jo", "i", "al", "dins la", "a la", "nosaltres", "dins el", "para", "aquest", "aquesta", "aquests", "aquestes", "aquell", "aquella", "aquells", "aquelles", "que" ], "articles": [ "el", "la", "l", "els", "les", "los" ] } lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/or.word000066400000000000000000000000021426211343400246740ustar00rootroot00000000000000o lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/second.word000066400000000000000000000000061426211343400255330ustar00rootroot00000000000000segon lingua-franca-release-v0.4.3/lingua_franca/res/text/ca-es/seconds.word000066400000000000000000000000071426211343400257170ustar00rootroot00000000000000segons lingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/000077500000000000000000000000001426211343400234165ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/and.word000066400000000000000000000000011426211343400250440ustar00rootroot00000000000000alingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/date_time.json000066400000000000000000000071231426211343400262470ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{3}$", "format": "{x_in_x00} sto"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} tisíc"}, "2": {"match": "^1\\d00$", 
"format": "{xx_in_xx00} sto"}, "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} sto"}, "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "b.c." }, "date_format": { "date_full": "{weekday}, {month} {day}, {formatted_year}", "date_full_no_year": "{weekday}, {month} {day}", "date_full_no_year_month": "{weekday}, {day}", "today": "dnes", "tomorrow": "zítra", "yesterday": "včera" }, "date_time_format": { "date_time": "{formatted_date} v {formatted_time}" }, "weekday": { "0": "pondělí", "1": "úterý", "2": "středa", "3": "čtvrtek", "4": "pátek", "5": "sobota", "6": "neděle" }, "date": { "1": "prvního", "2": "druhého", "3": "třetího", "4": "čtvrtého", "5": "pátého", "6": "šestého", "7": "sedmého", "8": "osmého", "9": "devátého", "10": "desátého", "11": "jedenáctého", "12": "dvanáctého", "13": "třináctého", "14": "čtrnáctého", "15": "patnáctého", "16": "šestnáctého", "17": "sedmnáctého", "18": "osmnáctého", "19": "devatenáctého", "20": "dvacátého", "21": "dvacátého-prvního", "22": "dvacátého-druhého", "23": "dvacátého-třetího", "24": "dvacátého-čtvrtého", "25": "dvacátého-pátého", "26": "dvacátého-šestého", "27": "dvacátého-sedmého", "28": "dvacátého-osmého", "29": "dvacátého-devátého", "30": "třicátého", "31": "třicátého-prvního" }, "month": { "1": "leden", "2": 
"únor", "3": "březen", "4": "duben", "5": "květen", "6": "červen", "7": "červenec", "8": "srpen", "9": "září", "10": "říjen", "11": "listopad", "12": "prosinec" }, "number": { "0": "nula", "1": "jedna", "2": "dva", "3": "tři", "4": "čtyři", "5": "pět", "6": "šest", "7": "sedm", "8": "osm", "9": "devět", "10": "deset", "11": "jedenáct", "12": "dvanáct", "13": "třináct", "14": "čtrnáct", "15": "patnáct", "16": "šestnáct", "17": "sedmnáct", "18": "osmnáct", "19": "devatenáct", "20": "dvacet", "30": "třicet", "40": "čtyřicet", "50": "padesát", "60": "šedesát", "70": "sedmdesát", "80": "osmdesát", "90": "devadesát" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/date_time_test.json000066400000000000000000000101221426211343400272770ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "jedna b.c." }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "deset b.c." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "devadesát dva b.c." 
}, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osm sto tři" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osm sto jedenáct" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "čtyři sto padesát čtyři" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "jedna tisíc pět" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deset dvanáct" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deset čtyřicet šest" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osmnáct sedm" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "sedmnáct sedmnáct" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "devatenáct osmdesát osm"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisíc devět"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dvacet osmnáct"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dvacet dvacet jedna"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dvacet třicet"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dvacet jedna sto" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "jedna tisíc" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisíc" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "třicet jedna dvacet b.c." }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "třicet dva čtyřicet jedna b.c." 
}, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "padesát dva sto" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "jedenáct sto" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dvacet jedna sto" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "úterý, leden třicátého-prvního, dvacet sedmnáct"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého, dvacet osmnáct"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "neděle, čtvrtého"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "zítra"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "dnes"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "včera"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "neděle, únor čtvrtého, dvacet osmnáct"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "úterý, leden třicátého-prvního, dvacet sedmnáct v jedna dvacet dva p.m."}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "úterý, leden třicátého-prvního, dvacet sedmnáct v třináct dvacet dva"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/day.word000066400000000000000000000000031426211343400250610ustar00rootroot00000000000000denlingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/days.word000066400000000000000000000000041426211343400252450ustar00rootroot00000000000000dnílingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/hour.word000066400000000000000000000000061426211343400252640ustar00rootroot00000000000000hodinalingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/hours.word000066400000000000000000000000061426211343400254470ustar00rootroot00000000000000hodinylingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/minute.word000066400000000000000000000000061426211343400256100ustar00rootroot00000000000000minutalingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/minutes.word000066400000000000000000000000061426211343400257730ustar00rootroot00000000000000minutylingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/normalize.json000066400000000000000000000016621426211343400263160ustar00rootroot00000000000000{ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, "remove_symbols": false, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, "contractions": {}, "word_replacements": {}, "number_replacements": { "nula": "0", "jedna": "1", "dva": "2", "dvě": "2", "tři": "3", "čtyři": "4", "pět": "5", "šest": "6", "sedm": "7", "sedum": "7", "osm": "8", "osum": "8", "devět": "9", "deset": "10", "jedenáct": "11", "dvanáct": "12", "třináct": "13", "čtrnáct": "14", "patnáct": "15", "šestnáct": "16", "sedmnáct": "17", "osmnáct": "18", "devatenáct": "19", "dvacet": "20", "třicet": "30", "čtyřicet": "40", "padesát": "50", "šedesát": "60", "sedmdesát": "70", "osmdesát": "80", "devadesát": "90" }, "stopwords": [], "articles": [] 
}lingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/or.word000066400000000000000000000000041426211343400247250ustar00rootroot00000000000000nebolingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/second.word000066400000000000000000000000071426211343400255630ustar00rootroot00000000000000sekundalingua-franca-release-v0.4.3/lingua_franca/res/text/cs-cz/seconds.word000066400000000000000000000000071426211343400257460ustar00rootroot00000000000000sekundylingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/000077500000000000000000000000001426211343400233575ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/and.word000066400000000000000000000000021426211343400250060ustar00rootroot00000000000000oglingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/date_time.json000066400000000000000000000065571426211343400262220ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^1$", "format": "et"}, "2": {"match": "^\\d$", "format": "{x}"}, "3": {"match": "^1\\d$", "format": "{xx}"}, "4": {"match": "^\\d0$", "format": "{x0}"}, "5": {"match": "^[2-9]\\d$", "format": "{x} og {x0}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^1\\d{2}$", "format": "et hundred"}, "2": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^1[1-9]\\d{2}$", "format": "{xx_in_xx00} hundred"}, "2": {"match": "^1\\d{3}$", "format": "et tusind"}, "3": {"match": "^\\d{4}$", "format": "{x_in_x000} tusind"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} og {formatted_decade} {bc}"}, "4": {"match": "^(1\\d00)|([2-9]000)$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{formatted_thousand} og {formatted_decade} {bc}"}, "6": {"match": 
"^\\d{4}$", "format": "{formatted_thousand} {formatted_hundreds} og {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "f.kr." }, "date_format": { "date_full": "{weekday}, den {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, den {day} {month}", "date_full_no_year_month": "{weekday}, den {day}", "today": "i dag", "tomorrow": "i morgen", "yesterday": "i går" }, "date_time_format": { "date_time": "{formatted_date} klokken {formatted_time}" }, "weekday": { "0": "mandag", "1": "tirsdag", "2": "onsdag", "3": "torsdag", "4": "fredag", "5": "lørdag", "6": "søndag" }, "date": { "1": "første", "2": "anden", "3": "tredie", "4": "fjerde", "5": "femte", "6": "sjette", "7": "syvende", "8": "ottende", "9": "ninende", "10": "tiende", "11": "elvte", "12": "tolvte", "13": "trettende", "14": "fjortende", "15": "femtende", "16": "sekstende", "17": "syttende", "18": "attende", "19": "nittende", "20": "tyvende", "21": "en og tyvende", "22": "to og tyvende", "23": "tre og tyvende", "24": "fire og tyvende", "25": "fem og tyvende", "26": "seks og tyvende", "27": "syv og tyvende", "28": "otte og tyvende", "29": "ni og tyvende", "30": "tredivte", "31": "en og tredivte" }, "month": { "1": "januar", "2": "februar", "3": "marts", "4": "april", "5": "maj", "6": "juni", "7": "juli", "8": "august", "9": "september", "10": "oktober", "11": "november", "12": "december" }, "number": { "0": "nul", "1": "en", "2": "to", "3": "tre", "4": "fire", "5": "fem", "6": "seks", "7": "syv", "8": "otte", "9": "ni", "10": "ti", "11": "elve", "12": "tolv", "13": "tretten", "14": "fjorten", "15": "femten", "16": "seksten", "17": "sytten", "18": "atten", "19": "nitten", "20": "tyve", "30": "tredive", "40": "fyrre", "50": "halvtreds", "60": "treds", "70": "halvfjerds", "80": "firs", "90": "halvfems", "100": "hundrede", "1000": "tusind", "2000": "to tusind" } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/date_time_test.json000066400000000000000000000056661426211343400272610ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "to tusind og sytten"}, "2": {"datetime_param": "1984, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nitten hundred og fire og firs"}, "3": {"datetime_param": "1906, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nitten hundred og seks"}, "4": {"datetime_param": "1802, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "atten hundred og to" }, "5": {"datetime_param": "806, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "otte hundred og seks" }, "6": {"datetime_param": "1800, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "atten hundred" }, "7": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "et" }, "8": {"datetime_param": "103, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "et hundred og tre" }, "9": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "et tusind" }, "10": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "to tusind" }, "11": {"datetime_param": "99, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ni og halvfems f.kr." }, "12": {"datetime_param": "5, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "fem f.kr." }, "13": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tre tusind et hundred og tyve f.kr." }, "14": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tre tusind to hundred og en og fyrre f.kr." 
} }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tirsdag, den en og tredivte januar, to tusind og sytten"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "søndag, den fjerde februar, to tusind og atten"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "søndag, den fjerde februar"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "søndag, den fjerde"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "i morgen"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "i dag"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "i går"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "søndag, den fjerde februar"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "søndag, den fjerde februar, to tusind og atten"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tirsdag, den en og tredivte januar, to tusind og sytten klokken et toogtyve om eftermiddagen"} } 
}lingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/day.word000066400000000000000000000000031426211343400250220ustar00rootroot00000000000000daglingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/days.word000066400000000000000000000000041426211343400252060ustar00rootroot00000000000000dagelingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/hour.word000066400000000000000000000000041426211343400252230ustar00rootroot00000000000000timelingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/hours.word000066400000000000000000000000051426211343400254070ustar00rootroot00000000000000timerlingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/minute.word000066400000000000000000000000051426211343400255500ustar00rootroot00000000000000minutlingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/minutes.word000066400000000000000000000000071426211343400257350ustar00rootroot00000000000000minuterlingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/or.word000066400000000000000000000000051426211343400246670ustar00rootroot00000000000000ellerlingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/second.word000066400000000000000000000000061426211343400255230ustar00rootroot00000000000000sekundlingua-franca-release-v0.4.3/lingua_franca/res/text/da-dk/seconds.word000066400000000000000000000000101426211343400257010ustar00rootroot00000000000000sekunderlingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/000077500000000000000000000000001426211343400233555ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/and.word000066400000000000000000000000041426211343400250060ustar00rootroot00000000000000und lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/date_time.json000066400000000000000000000100051426211343400261770ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", 
"format": "{x} und {x0}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^1\\d{2}$", "format": "hundert"}, "2": {"match": "^\\d{3}$", "format": "{x_in_x00} hundert"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^10\\d\\d$", "format": "tausend"}, "2": {"match": "^\\d0\\d{2}$", "format": "{x_in_x000} tausend"}, "3": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundert"}, "4": {"match": "^\\d{2}00$", "format": "{x_in_x000} tausend {x_in_x00} hundert"}, "5": {"match": "^\\d0\\d\\d$", "format": "{x_in_x000} tausend"}, "6": {"match": "^1\\d{3}$", "format": "{xx_in_xx00}"}, "7": {"match": "^\\d{4}$", "format": "{x_in_x000} tausend {x_in_x00} hundert"}, "default": "{number}" }, "year_format": { "1": {"match": "^1$", "format": "eins {bc}"}, "2": {"match": "^\\d{1}?$", "format": "{formatted_decade} {bc}"}, "3": {"match": "^\\d{2}?$", "format": "{formatted_decade} {bc}"}, "4": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "5": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "6": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "7": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "8": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} hundert {formatted_decade} {bc}"}, "9": {"match": "^1[2-9]\\d{2}$", "format": "{formatted_thousand} hundert {formatted_decade} {bc}"}, "10": {"match": "^1\\d{3}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "11": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "v.d.Z." 
}, "date_format": { "date_full": "{weekday}, {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day}", "today": "heute", "tomorrow": "morgen", "yesterday": "gestern" }, "date_time_format": { "date_time": "{formatted_date} um {formatted_time}" }, "weekday": { "0": "Montag", "1": "Dienstag", "2": "Mittwoch", "3": "Donnerstag", "4": "Freitag", "5": "Samstag", "6": "Sonntag" }, "date": { "1": "erster", "2": "zweiter", "3": "dritter", "4": "vierter", "5": "fünfter", "6": "sechster", "7": "siebter", "8": "achter", "9": "neunter", "10": "zehnter", "11": "elfter", "12": "zwölfter", "13": "dreizehnter", "14": "vierzehnter", "15": "fünfzehnter", "16": "sechzehnter", "17": "siebzehnter", "18": "achtzehnter", "19": "neunzehnter", "20": "zwanzigster", "21": "einundzwanzigster", "22": "zweiundzwanzigster", "23": "dreiundzwanzigster", "24": "vierundzwanzigster", "25": "fünfundzwanzigster", "26": "sechsundzwanzigster", "27": "siebenundzwanzigster", "28": "achtundzwanzigster", "29": "neunundzwanzigster", "30": "dreißigster", "31": "einunddreißigster" }, "month": { "1": "Januar", "2": "Februar", "3": "März", "4": "April", "5": "Mai", "6": "Juni", "7": "Juli", "8": "August", "9": "September", "10": "Oktober", "11": "November", "12": "Dezember" }, "number": { "0": "null", "1": "ein", "2": "zwei", "3": "drei", "4": "vier", "5": "fünf", "6": "sechs", "7": "sieben", "8": "acht", "9": "neun", "10": "zehn", "11": "elf", "12": "zwölf", "13": "dreizehn", "14": "vierzehn", "15": "fünfzehn", "16": "sechzehn", "17": "siebzehn", "18": "achtzehn", "19": "neunzehn", "20": "zwanzig", "30": "dreißig", "40": "vierzig", "50": "fünfzig", "60": "sechzig", "70": "siebzig", "80": "achtzig", "90": "neunzig" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/date_time_test.json000066400000000000000000000103461426211343400272460ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", 
"bc": "True", "assertEqual": "eins v.d.Z." }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "zehn v.d.Z." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "zwei und neunzig v.d.Z." }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht hundert drei" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht hundert elf" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vier hundert vier und fünfzig" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend fünf" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend zwölf" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend sechs und vierzig" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "achtzehn hundert sieben" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "siebzehn hundert siebzehn" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "neunzehn hundert acht und achtzig"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend neun"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend achtzehn"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend ein und zwanzig"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend dreißig"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "zwei tausend ein hundert" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tausend" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", 
"assertEqual": "drei tausend ein hundert zwanzig v.d.Z." }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "drei tausend zwei hundert ein und vierzig v.d.Z." }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fünf tausend zwei hundert" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elf hundert" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "zwei tausend ein hundert" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar, zwei tausend achtzehn"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "morgen"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "heute"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "gestern"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar, zwei tausend achtzehn"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn um ein Uhr zweiundzwanzig nachmittags"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", 
"assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn um dreizehn Uhr zweiundzwanzig"} } } lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/day.word000066400000000000000000000000041426211343400250210ustar00rootroot00000000000000Tag lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/days.word000066400000000000000000000000051426211343400252050ustar00rootroot00000000000000Tage lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/hour.word000066400000000000000000000000071426211343400252240ustar00rootroot00000000000000Stunde lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/hours.word000066400000000000000000000000101426211343400254010ustar00rootroot00000000000000Stunden lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/minute.word000066400000000000000000000000071426211343400255500ustar00rootroot00000000000000Minute lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/minutes.word000066400000000000000000000000101426211343400257250ustar00rootroot00000000000000Minuten lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/or.word000066400000000000000000000000051426211343400246650ustar00rootroot00000000000000oder lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/second.word000066400000000000000000000000101426211343400255140ustar00rootroot00000000000000Sekunde lingua-franca-release-v0.4.3/lingua_franca/res/text/de-de/seconds.word000066400000000000000000000000111426211343400257000ustar00rootroot00000000000000Sekunden lingua-franca-release-v0.4.3/lingua_franca/res/text/en-au/000077500000000000000000000000001426211343400234045ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/en-au/date_time.json000066400000000000000000000066411426211343400262410ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, 
"default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} thousand"}, "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundred"}, "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred"}, "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} oh {formatted_decade} {bc}"}, "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "b.c." 
}, "date_format": { "date_full": "{weekday}, {month} {day}, {formatted_year}", "date_full_no_year": "{weekday}, {month} {day}", "date_full_no_year_month": "{weekday}, {day}", "today": "today", "tomorrow": "tomorrow", "yesterday": "yesterday" }, "date_time_format": { "date_time": "{formatted_date} at {formatted_time}" }, "weekday": { "0": "monday", "1": "tuesday", "2": "wednesday", "3": "thursday", "4": "friday", "5": "saturday", "6": "sunday" }, "date": { "1": "first", "2": "second", "3": "third", "4": "fourth", "5": "fifth", "6": "sixth", "7": "seventh", "8": "eighth", "9": "ninth", "10": "tenth", "11": "eleventh", "12": "twelfth", "13": "thirteenth", "14": "fourteenth", "15": "fifteenth", "16": "sixteenth", "17": "seventeenth", "18": "eighteenth", "19": "nineteenth", "20": "twentieth", "21": "twenty-first", "22": "twenty-second", "23": "twenty-third", "24": "twenty-fourth", "25": "twenty-fifth", "26": "twenty-sixth", "27": "twenty-seventh", "28": "twenty-eighth", "29": "twenty-ninth", "30": "thirtieth", "31": "thirty-first" }, "month": { "1": "january", "2": "february", "3": "march", "4": "april", "5": "may", "6": "june", "7": "july", "8": "august", "9": "september", "10": "october", "11": "november", "12": "december" }, "number": { "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine", "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen", "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen", "18": "eighteen", "19": "nineteen", "20": "twenty", "30": "thirty", "40": "forty", "50": "fifty", "60": "sixty", "70": "seventy", "80": "eighty", "90": "ninety" } }lingua-franca-release-v0.4.3/lingua_franca/res/text/en-au/date_time_test.json000066400000000000000000000100371426211343400272720ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "one b.c." 
}, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ten b.c." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ninety two b.c." }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred three" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred eleven" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "four hundred fifty four" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "one thousand five" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten twelve" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten forty six" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eighteen oh seven" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "seventeen seventeen" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nineteen eighty eight"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand nine"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty eighteen"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty twenty one"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty thirty"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "one thousand" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty one twenty b.c." 
}, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty two forty one b.c." }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fifty two hundred" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "eleven hundred" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tuesday, january thirty-first, twenty seventeen"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "sunday, fourth"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "tomorrow"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "today"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "yesterday"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tuesday, january thirty-first, twenty seventeen at one twenty two p.m."}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "tuesday, january thirty-first, twenty seventeen at thirteen twenty two"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/000077500000000000000000000000001426211343400234265ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/and.word000066400000000000000000000000031426211343400250560ustar00rootroot00000000000000andlingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/date_time.json000066400000000000000000000066461426211343400262700ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} thousand"}, "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundred"}, "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred"}, "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} oh {formatted_decade} {bc}"}, "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "b.c." 
}, "date_format": { "date_full": "{weekday}, {month} {day}, {formatted_year}", "date_full_no_year": "{weekday}, {month} {day}", "date_full_no_year_month": "{weekday}, the {day}", "today": "today", "tomorrow": "tomorrow", "yesterday": "yesterday" }, "date_time_format": { "date_time": "{formatted_date} at {formatted_time}" }, "weekday": { "0": "monday", "1": "tuesday", "2": "wednesday", "3": "thursday", "4": "friday", "5": "saturday", "6": "sunday" }, "date": { "1": "first", "2": "second", "3": "third", "4": "fourth", "5": "fifth", "6": "sixth", "7": "seventh", "8": "eighth", "9": "ninth", "10": "tenth", "11": "eleventh", "12": "twelfth", "13": "thirteenth", "14": "fourteenth", "15": "fifteenth", "16": "sixteenth", "17": "seventeenth", "18": "eighteenth", "19": "nineteenth", "20": "twentieth", "21": "twenty-first", "22": "twenty-second", "23": "twenty-third", "24": "twenty-fourth", "25": "twenty-fifth", "26": "twenty-sixth", "27": "twenty-seventh", "28": "twenty-eighth", "29": "twenty-ninth", "30": "thirtieth", "31": "thirty-first" }, "month": { "1": "january", "2": "february", "3": "march", "4": "april", "5": "may", "6": "june", "7": "july", "8": "august", "9": "september", "10": "october", "11": "november", "12": "december" }, "number": { "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine", "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen", "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen", "18": "eighteen", "19": "nineteen", "20": "twenty", "30": "thirty", "40": "forty", "50": "fifty", "60": "sixty", "70": "seventy", "80": "eighty", "90": "ninety" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/date_time_test.json000066400000000000000000000100431426211343400273110ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "one b.c." 
}, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ten b.c." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ninety two b.c." }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred three" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eight hundred eleven" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "four hundred fifty four" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "one thousand five" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten twelve" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ten forty six" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "eighteen oh seven" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "seventeen seventeen" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nineteen eighty eight"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand nine"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty eighteen"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty twenty one"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twenty thirty"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "one thousand" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "two thousand" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty one twenty b.c." 
}, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "thirty two forty one b.c." }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fifty two hundred" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "eleven hundred" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twenty one hundred" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tuesday, january thirty-first, twenty seventeen"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "sunday, february fourth"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "sunday, the fourth"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "tomorrow"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "today"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "yesterday"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "sunday, february fourth, twenty eighteen"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tuesday, january thirty-first, twenty seventeen at one twenty two p.m."}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "tuesday, january thirty-first, twenty seventeen at thirteen twenty two"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/day.word000066400000000000000000000000031426211343400250710ustar00rootroot00000000000000daylingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/days.word000066400000000000000000000000041426211343400252550ustar00rootroot00000000000000dayslingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/hour.word000066400000000000000000000000041426211343400252720ustar00rootroot00000000000000hourlingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/hours.word000066400000000000000000000000051426211343400254560ustar00rootroot00000000000000hourslingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/minute.word000066400000000000000000000000061426211343400256200ustar00rootroot00000000000000minutelingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/minutes.word000066400000000000000000000000071426211343400260040ustar00rootroot00000000000000minuteslingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/normalize.json000066400000000000000000000064261426211343400263310ustar00rootroot00000000000000{ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, "remove_symbols": false, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, "contractions": { "I'd": "I would", "I'll": "I will", "I'm": "I am", "I've": "I have", "ain't": "is not", "aren't": "are not", "can't": "can not", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "gonna": "going to", "gotta": "got to", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is", "isn't": "is not", "it'd": "it would", "it'll": "it will", "it's": "it is", "might've": "might have", "mightn't": "might not", "must've": "must have", "mustn't": "must not", "needn't": "need not", "oughtn't": "ought not", "shan't": "shall not", "she'd": "she would", 
"she'll": "she will", "she's": "she is", "should've": "should have", "shouldn't": "should not", "somebody's": "somebody is", "someone'd": "someone would", "someone'll": "someone will", "someone's": "someone is", "that'd": "that would", "that'll": "that will", "that's": "that is", "there'd": "there would", "there're": "there are", "there's": "there is", "they'd": "they would", "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not", "we'd": "we would", "we'll": "we will", "we're": "we are", "we've": "we have", "weren't": "were not", "what'd": "what did", "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have", "whats": "what is", "when'd": "when did", "when's": "when is", "where'd": "where did", "where's": "where is", "where've": "where have", "who'd": "who would", "who'd've": "who would have", "who'll": "who will", "who're": "who are", "who's": "who is", "who've": "who have", "why'd": "why did", "why're": "why are", "why's": "why is", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'ain't": "you are not", "y'aint": "you are not", "y'all": "you all", "ya'll": "you all", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you're": "you are", "you've": "you have" }, "word_replacements": {}, "number_replacements": { "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14", "fifteen": "15", "sixteen": "16", "seventeen": "17", "eighteen": "18", "nineteen": "19", "twenty": "20", "thirty": "30", "forty": "40", "fifty": "50", "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90" }, "stopwords": [], "articles": [ "the", "a", "an" ] 
}lingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/or.word000066400000000000000000000000021426211343400247330ustar00rootroot00000000000000orlingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/second.word000066400000000000000000000000061426211343400255720ustar00rootroot00000000000000secondlingua-franca-release-v0.4.3/lingua_franca/res/text/en-us/seconds.word000066400000000000000000000000071426211343400257560ustar00rootroot00000000000000secondslingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/000077500000000000000000000000001426211343400234135ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/day.word000066400000000000000000000000051426211343400250600ustar00rootroot00000000000000día lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/days.word000066400000000000000000000000061426211343400252440ustar00rootroot00000000000000días lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/hour.word000066400000000000000000000000051426211343400252600ustar00rootroot00000000000000hora lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/hours.word000066400000000000000000000000061426211343400254440ustar00rootroot00000000000000horas lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/minute.word000066400000000000000000000000071426211343400256060ustar00rootroot00000000000000minuto lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/minutes.word000066400000000000000000000000101426211343400257630ustar00rootroot00000000000000minutos lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/second.word000066400000000000000000000000101426211343400255520ustar00rootroot00000000000000segundo lingua-franca-release-v0.4.3/lingua_franca/res/text/es-es/seconds.word000066400000000000000000000000111426211343400257360ustar00rootroot00000000000000segundos 
lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/000077500000000000000000000000001426211343400234175ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/date_time.json000066400000000000000000000044411426211343400262500ustar00rootroot00000000000000{ "decade_format": { "default": "{number}" }, "hundreds_format": { "default": "{number}" }, "thousand_format": { "default": "{number}" }, "year_format": { "default": "{year} {bc}", "bc": "k.a." }, "date_format": { "date_full": "{weekday}, {formatted_year}ko {month}ren {day}", "date_full_no_year": "{weekday}, {month}ren {day}", "date_full_no_year_month": "{weekday}, {day}", "today": "gaur", "tomorrow": "bihar", "yesterday": "atzo" }, "date_time_format": { "date_time": "{formatted_date} {formatted_time}tan" }, "weekday": { "0": "astelehena", "1": "asteartea", "2": "asteazkena", "3": "osteguna", "4": "ostirala", "5": "larunbata", "6": "igandea" }, "date": { "1": "bat", "2": "bi", "3": "hiru", "4": "lau", "5": "bost", "6": "sei", "7": "zazpi", "8": "zortzi", "9": "bederatzi", "10": "hamar", "11": "hamaika", "12": "hamabi", "13": "hamahiru", "14": "hamalau", "15": "hamabost", "16": "hamasei", "17": "hamazazpi", "18": "hemezortzi", "19": "hemeretzi", "20": "hogei", "21": "hogeita bat", "22": "hogeita bi", "23": "hogeita hiru", "24": "hogeita lau", "25": "hogeita bost", "26": "hogeita sei", "27": "hogeita zazpi", "28": "hogeita zorti", "29": "hogeita bederatzi", "30": "hogeita hamar", "31": "hogeita hamaika" }, "month": { "1": "urtarrila", "2": "otsaila", "3": "martxoa", "4": "apirila", "5": "maiatza", "6": "ekaina", "7": "uztaila", "8": "abuztua", "9": "iraula", "10": "urria", "11": "azaroa", "12": "abendua" }, "number": { "0": "zero", "1": "bat", "2": "bi", "3": "hiru", "4": "lau", "5": "bost", "6": "sei", "7": "zazpi", "8": "zortzi", "9": "bederatzi", "10": "hamar", "11": "hamaika", "12": "hamabi", "13": "hamahiru", "14": "hamalau", "15": "hamabost", "16": "hamasei", "17": 
"hamazazpi", "18": "hemezortzi", "19": "hemeretzi", "20": "hogei", "30": "hogeita hamar", "40": "berrogei", "50": "berrogeita hamar", "60": "hirurogei", "70": "hirurogeita hamar", "80": "laurogei", "90": "laurogeita hamar" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/day.word000066400000000000000000000000051426211343400250640ustar00rootroot00000000000000egun lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/days.word000066400000000000000000000000051426211343400252470ustar00rootroot00000000000000egun lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/hour.word000066400000000000000000000000051426211343400252640ustar00rootroot00000000000000ordu lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/hours.word000066400000000000000000000000051426211343400254470ustar00rootroot00000000000000ordu lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/minute.word000066400000000000000000000000071426211343400256120ustar00rootroot00000000000000minutu lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/minutes.word000066400000000000000000000000071426211343400257750ustar00rootroot00000000000000minutu lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/second.word000066400000000000000000000000101426211343400255560ustar00rootroot00000000000000segundo lingua-franca-release-v0.4.3/lingua_franca/res/text/eu-eu/seconds.word000066400000000000000000000000101426211343400257410ustar00rootroot00000000000000segundu lingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/000077500000000000000000000000001426211343400233755ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/and.word000066400000000000000000000000021426211343400250240ustar00rootroot00000000000000وlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/date_time.json000066400000000000000000000077241426211343400262350ustar00rootroot00000000000000{ "decade_format": { "1": { "match": "^\\d$", "format": "{x}" }, "2": { "match": "^1\\d$", 
"format": "{xx}" }, "3": { "match": "^\\d0$", "format": "{x0}" }, "4": { "match": "^[2-9]\\d$", "format": "{x0} {x}" }, "default": "{number}" }, "hundreds_format": { "1": { "match": "^\\d{3}$", "format": "{x_in_x00} hundred" }, "default": "{number}" }, "thousand_format": { "1": { "match": "^\\d00\\d$", "format": "{x_in_x000} thousand" }, "2": { "match": "^1\\d00$", "format": "{xx_in_xx00} hundred" }, "3": { "match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred" }, "4": { "match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}" }, "5": { "match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}" }, "default": "{number}" }, "year_format": { "1": { "match": "^\\d\\d?$", "format": "{formatted_decade} {bc}" }, "2": { "match": "^\\d00$", "format": "{formatted_hundreds} {bc}" }, "3": { "match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}" }, "4": { "match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}" }, "5": { "match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}" }, "6": { "match": "^\\d{2}0\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}" }, "7": { "match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}" }, "default": "{year} {bc}", "bc": "بعد از میلاد" }, "date_format": { "date_full": "{weekday}, {day} {month} {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day}", "today": "امروز", "tomorrow": "فردا", "yesterday": "دیروز" }, "date_time_format": { "date_time": "{formatted_date} ساعت {formatted_time}" }, "weekday": { "0": "دوشنبه", "1": "سه شنبه", "2": "چهارشنبه", "3": "پنج شنبه", "4": "جمعه", "5": "شنبه", "6": "یکشنبه" }, "date": { "1": "یکم", "2": "دوم", "3": "سوم", "4": "چهارم", "5": "پنجم", "6": "ششم", "7": "هفتم", "8": "هشتم", "9": "نهم", "10": "دهم", "11": "یازدهم", "12": "دوازدهم", "13": "سیزدهم", "14": "چهاردهم", "15": "پونزدهم", "16": "شونزدهم", "17": "هیفدهم", "18": "هیجدهم", "19": "نوزدهم", "20": 
"بیستم", "21": "بیست و یکم", "22": "بیست و دوم", "23": "بیست و سوم", "24": "بیست و چهارم", "25": "بیست و پنجم", "26": "بیست و ششم", "27": "بیست و هفتم", "28": "بیست و هشتم", "29": "بیست و نهم", "30": "سیم", "31": "سی و یکم" }, "month": { "1": "ژانویه", "2": "فوریه", "3": "مارس", "4": "آوریل", "5": "مه", "6": "جون", "7": "جولای", "8": "آگوست", "9": "سپتامبر", "10": "اکتبر", "11": "نوامبر", "12": "دسامبر" }, "number": { "0": "صفر", "1": "یک", "2": "دو", "3": "سه", "4": "چهار", "5": "پنج", "6": "شش", "7": "هفت", "8": "هشت", "9": "نه", "10": "ده", "11": "یازده", "12": "دوازده", "13": "سیزده", "14": "چهارده", "15": "پونزده", "16": "شونزده", "17": "هیفده", "18": "هیجده", "19": "نوزده", "20": "بیست", "30": "سی", "40": "چهل", "50": "پنجاه", "60": "شصت", "70": "هفتاد", "80": "هشتاد", "90": "نود" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/date_time_test.json000066400000000000000000000071711426211343400272700ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "یک بعد از میلاد" }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ده بعد از میلاد" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده دوازده" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده چهل و شش" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیجده صفر هفت" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیفده هیفده" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "نوزده هشتاد و هشت"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و نه"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست هیجده"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست بیست و یک"}, "16": {"datetime_param": "2030, 1, 31, 13, 
22, 3", "bc": "None", "assertEqual": "بیست سی"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "دو هزار و صد" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و یک بیست بعد از میلاد" }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و دو چهل و یک بعد از میلاد" }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "پنجاه و دو هزار" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "فردا"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "امروز"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "دیروز"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت یک و بیست و دو دقیقه بعد از ظهر"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", 
"use_24hour": "True", "use_ampm": "False", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت سیزده و بیست و دو دقیقه"} } } lingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/day.word000066400000000000000000000000061426211343400250430ustar00rootroot00000000000000روزlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/days.word000066400000000000000000000000061426211343400252260ustar00rootroot00000000000000روزlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/hour.word000066400000000000000000000000101426211343400252360ustar00rootroot00000000000000ساعتlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/hours.word000066400000000000000000000000101426211343400254210ustar00rootroot00000000000000ساعتlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/minute.word000066400000000000000000000000121426211343400255640ustar00rootroot00000000000000دقیقهlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/minutes.word000066400000000000000000000000121426211343400257470ustar00rootroot00000000000000دقیقهlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/or.word000066400000000000000000000000041426211343400247040ustar00rootroot00000000000000یاlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/second.word000066400000000000000000000000121426211343400255360ustar00rootroot00000000000000ثانیهlingua-franca-release-v0.4.3/lingua_franca/res/text/fa-ir/seconds.word000066400000000000000000000000121426211343400257210ustar00rootroot00000000000000ثانیهlingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/000077500000000000000000000000001426211343400234135ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/date_time.json000066400000000000000000000074611426211343400262510ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^\\d0$", "format": "{x0}"}, "3": {"match": "^[2-6]1$", "format": "{x0}-et-un"}, "4": {"match": "^[2-6|8]\\d$", "format": "{x0}-{x}"}, 
"5": {"match": "^\\d{2}$", "format": "{xx}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{1,2}$", "format": "{formatted_decade}"}, "2": {"match": "^100$", "format": "cent"}, "3": {"match": "^\\d00$", "format": "{x_in_x00}-cents"}, "4": {"match": "^1\\d{2}$", "format": "cent-{formatted_decade}"}, "5": {"match": "^\\d{3}$", "format": "{x_in_x00}-cent-{formatted_decade}"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^1000$", "format": "mille"}, "2": {"match": "^\\d000$", "format": "{x_in_x000}-mille"}, "3": {"match": "^1\\d{3}$", "format": "mille-{formatted_hundreds}"}, "4": {"match": "^\\d{4}$", "format": "{x_in_x000}-mille-{formatted_hundreds}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^[1-9]\\d{3}$", "format": "{formatted_thousand} {bc}"}, "default": "{year} {bc}", "bc": "avant Jésus Christ " }, "date_format": { "date_full": "{weekday} {day} {month} {formatted_year}", "date_full_no_year": "{weekday} {day} {month}", "date_full_no_year_month": "{weekday} {day}", "today": "aujourd'hui", "tomorrow": "demain", "yesterday": "hier" }, "date_time_format": { "date_time": "{formatted_date} {formatted_time}" }, "weekday": { "0": "lundi", "1": "mardi", "2": "mercredi", "3": "jeudi", "4": "vendredi", "5": "samedi", "6": "dimanche" }, "date": { "1": "premier", "2": "deux", "3": "trois", "4": "quatre", "5": "cinq", "6": "six", "7": "sept", "8": "huit", "9": "neuf", "10": "dix", "11": "onze", "12": "douze", "13": "treize", "14": "quatorze", "15": "quinze", "16": "seize", "17": "dix-sept", "18": "dix-huit", "19": "dix-neuf", "20": "vingt", "21": "vingt-et-un", "22": "vingt-deux", "23": "vingt-trois", "24": "vingt-quatre", "25": "vingt-cinq", "26": "vingt-six", "27": "vingt-sept", "28": "vingt-huit", "29": "vingt-neuf", "30": "trente", "31": "trente-et-un" }, "month": { "1": "janvier", "2": 
"février", "3": "mars", "4": "avril", "5": "mai", "6": "juin", "7": "juillet", "8": "août", "9": "septembre", "10": "octobe", "11": "novembre", "12": "décembre" }, "number": { "0": "zéro", "1": "un", "2": "deux", "3": "trois", "4": "quatre", "5": "cinq", "6": "six", "7": "sept", "8": "huit", "9": "neuf", "10": "dix", "11": "onze", "12": "douze", "13": "treize", "14": "quatorze", "15": "quinze", "16": "seize", "17": "dix-sept", "18": "dix-huit", "19": "dix-neuf", "20": "vingt", "30": "trente", "40": "quarante", "50": "cinquante", "60": "soixante", "70": "soixante-dix", "71": "soixante-et-onze", "72": "soixante-douze", "73": "soixante-treize", "74": "soixante-quatorze", "75": "soixante-quinze", "76": "soixante-seize", "77": "soixante-dix-sept", "78": "soixante-dix-huit", "79": "soixante-dix-neuf", "80": "quatre-vingt", "90": "quatre-vingt-dix", "91": "quatre-vingt-onze", "92": "quatre-vingt-douze", "93": "quatre-vingt-treize", "94": "quatre-vingt-quatorze", "95": "quatre-vingt-quinze", "96": "quatre-vingt-seize", "97": "quatre-vingt-dix-sept", "98": "quatre-vingt-dix-huit", "99": "quatre-vingt-dix-neuf" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/date_time_test.json000066400000000000000000000102501426211343400272760ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "un avant Jésus Christ" }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dix avant Jésus Christ" }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "quatre-vingt-douze avant Jésus Christ" }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "huit-cent-trois" }, "5": {"datetime_param": "111, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "cent-onze" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "quatre-cent-cinquante-quatre" }, "7": {"datetime_param": "2005, 1, 31, 13, 22, 3", "bc": "False", 
"assertEqual": "deux-mille-cinq" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille-douze" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille-quarante-six" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille-huit-cent-sept" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille-sept-cent-dix-sept" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille-neuf-cent-quatre-vingt-huit"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-neuf"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-dix-huit"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-vingt-et-un"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille-trente"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deux-mille-cent" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "deux-mille" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trois-mille-cent-vingt avant Jésus Christ" }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trois-mille-deux-cent-quarante-et-un avant Jésus Christ" }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinq-mille-deux-cents" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille-cent" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "deux-mille-cent" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "mardi trente-et-un janvier deux-mille-dix-sept"}, 
"2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "dimanche quatre février deux-mille-dix-huit"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "dimanche quatre février"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "dimanche quatre"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "demain"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "aujourd'hui"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "hier"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "dimanche quatre février"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "dimanche quatre février deux-mille-dix-huit"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "mardi trente-et-un janvier deux-mille-dix-sept une heure vingt-deux de l'après-midi"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "mardi trente-et-un janvier deux-mille-dix-sept treize heures vingt-deux"} } } lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/day.word000066400000000000000000000000051426211343400250600ustar00rootroot00000000000000jour lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/days.word000066400000000000000000000000061426211343400252440ustar00rootroot00000000000000jours lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/hour.word000066400000000000000000000000061426211343400252610ustar00rootroot00000000000000heure lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/hours.word000066400000000000000000000000071426211343400254450ustar00rootroot00000000000000heures 
lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/minute.word000066400000000000000000000000071426211343400256060ustar00rootroot00000000000000minute lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/minutes.word000066400000000000000000000000101426211343400257630ustar00rootroot00000000000000minutes lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/second.word000066400000000000000000000000101426211343400255520ustar00rootroot00000000000000seconde lingua-franca-release-v0.4.3/lingua_franca/res/text/fr-fr/seconds.word000066400000000000000000000000111426211343400257360ustar00rootroot00000000000000secondes lingua-franca-release-v0.4.3/lingua_franca/res/text/hu-hu/000077500000000000000000000000001426211343400234255ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/hu-hu/date_time.json000066400000000000000000000074641426211343400262660ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^20$", "format": "húsz"}, "4": {"match": "^\\d0$", "format": "{x0}"}, "5": {"match": "^[2-9]\\d$", "format": "{x0}{x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^2\\d{2}$", "format": "kétszáz"}, "2": {"match": "^\\d{3}$", "format": "{x_in_x00}száz"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^1\\d{3}$", "format": "ezer"}, "2": {"match": "^2\\d{3}$", "format": "kétezer"}, "3": {"match": "^\\d{4}$", "format": "{x_in_x000}ezer"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{bc} {formatted_decade}"}, "2": {"match": "^\\d000$", "format": "{bc} {formatted_thousand}"}, "3": {"match": "^\\d{3}$", "format": "{bc} {formatted_hundreds}{formatted_decade}"}, "4": {"match": "^[2-9]\\d00$", "format": "{bc} {formatted_thousand}-{formatted_hundreds}"}, "5": {"match": "^1[0-9]00$", "format": "{bc} {formatted_thousand}{formatted_hundreds}"}, "6": {"match": "^[2-9]0\\d{2}$", "format": 
"{bc} {formatted_thousand}-{formatted_decade}"}, "7": {"match": "^10\\d{2}$", "format": "{bc} {formatted_thousand}{formatted_decade}"}, "8": {"match": "^[2-9]00\\d$", "format": "{bc} {formatted_thousand}{formatted_decade}"}, "9": {"match": "^1\\d{3}$", "format": "{bc} {formatted_thousand}{formatted_hundreds}{formatted_decade}"}, "10": {"match": "^[2-9]\\d{3}$", "format": "{bc} {formatted_thousand}-{formatted_hundreds}{formatted_decade}"}, "default": "{bc} {year}", "bc": "kr.e." }, "date_format": { "date_full": "{formatted_year} {month} {day}, {weekday}", "date_full_no_year": "{month} {day}, {weekday}", "date_full_no_year_month": "{day}, {weekday}", "today": "ma", "tomorrow": "holnap", "yesterday": "tegnap" }, "date_time_format": { "date_time": "{formatted_date}, {formatted_time}" }, "weekday": { "0": "hétfő", "1": "kedd", "2": "szerda", "3": "csütörtök", "4": "péntek", "5": "szombat", "6": "vasárnap" }, "date": { "1": "elseje", "2": "másodika", "3": "harmadika", "4": "negyedike", "5": "ötödike", "6": "hatodika", "7": "hetedike", "8": "nyolcadika", "9": "kilencedike", "10": "tizedike", "11": "tizenegyedike", "12": "tizenkettedike", "13": "tizenharmadika", "14": "tizennegyedike", "15": "tizenötödike", "16": "tizenhatodika", "17": "tizenhetedike", "18": "tizennyolcadika", "19": "tizenkilencedike", "20": "huszadika", "21": "huszonegyedike", "22": "huszonkettedike", "23": "huszonharmadika", "24": "huszonnegyedike", "25": "huszonötödike", "26": "huszonhatodika", "27": "huszonhetedike", "28": "huszonnyolcadika", "29": "huszonkilencedike", "30": "harmincadika", "31": "harmincegyedike" }, "month": { "1": "január", "2": "február", "3": "március", "4": "április", "5": "május", "6": "június", "7": "július", "8": "augusztus", "9": "szeptember", "10": "október", "11": "november", "12": "december" }, "number": { "0": "nulla", "1": "egy", "2": "kettő", "3": "három", "4": "négy", "5": "öt", "6": "hat", "7": "hét", "8": "nyolc", "9": "kilenc", "10": "tíz", "11": "tizenegy", "12": 
"tizenkettő", "13": "tizenhárom", "14": "tizennégy", "15": "tizenöt", "16": "tizenhat", "17": "tizenhét", "18": "tizennyolc", "19": "tizenkilenc", "20": "huszon", "30": "harminc", "40": "negyven", "50": "ötven", "60": "hatvan", "70": "hetven", "80": "nyolcvan", "90": "kilencven" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/hu-hu/date_time_test.json000066400000000000000000000101421426211343400273100ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. egy" }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. tíz" }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. kilencvenkettő" }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nyolcszázhárom" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nyolcszáztizenegy" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "négyszázötvennégy" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezeröt" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezertizenkettő" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezernegyvenhat" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezernyolcszázhét" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezerhétszáztizenhét" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezerkilencszáznyolcvannyolc"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-kilenc"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-tizennyolc"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-huszonegy"}, "16": {"datetime_param": "2030, 1, 
31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer-harminc"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "kétezer-egyszáz" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ezer" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "kétezer" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. háromezer-egyszázhúsz" }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "kr.e. háromezer-kétszáznegyvenegy" }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ötezer-kétszáz" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ezeregyszáz" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "kétezer-egyszáz" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "kétezer-tizenhét január harmincegyedike, kedd"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "kétezer-tizennyolc február negyedike, vasárnap"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "február negyedike, vasárnap"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "negyedike, vasárnap"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "holnap"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "ma"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "tegnap"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "február negyedike, vasárnap"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "kétezer-tizennyolc február negyedike, vasárnap"} }, 
"test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "kétezer-tizenhét január harmincegyedike, kedd, délután egy óra huszonkettő"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "kétezer-tizenhét január harmincegyedike, kedd, tizenhárom óra huszonkettő"} } } lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/000077500000000000000000000000001426211343400234255ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/date_time.json000066400000000000000000000104501426211343400262530ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9][1|8]", "format": "{xx}"}, "5": {"match": "^[2-9]\\d$", "format": "{x0}{x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^1\\d{2}$", "format": "cento"}, "2": {"match": "^\\d{3}$", "format": "{x_in_x00}cento"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^1\\d{3}$", "format": "mille"}, "2": {"match": "^\\d{4}$", "format": "{x_in_x000}mila"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d000$", "format": "{formatted_thousand} {bc}"}, "4": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "5": {"match": "^1[0-9]00$", "format": "{formatted_thousand} {formatted_hundreds} {bc}"}, "6": {"match": "^10\\d{2}$", "format": "{formatted_thousand} e {formatted_decade} {bc}"}, "7": {"match": "^[2-9][0-9]00$", "format": "{formatted_thousand} {formatted_hundreds} {bc}"}, "8": {"match": "^20\\d{2}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "9": {"match": 
"^(1\\d{3})|(\\d0\\d{2})$", "format": "{formatted_thousand} {formatted_hundreds} {formatted_decade} {bc}"}, "10": {"match": "^[2-9]000$", "format": "{formatted_thousand} {bc}"}, "11": {"match": "^20\\d{2}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "12": {"match": "^([2-9]\\d{3})|(\\d0\\d{2})$", "format": "{formatted_thousand} {formatted_hundreds} {formatted_decade} {bc}"}, "13": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_hundreds} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "a.C." }, "date_format": { "date_full": "{weekday}, {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day}", "today": "oggi", "tomorrow": "domani", "yesterday": "ieri" }, "date_time_format": { "date_time": "{formatted_date} alle {formatted_time}" }, "weekday": { "0": "lunedì", "1": "martedì", "2": "mercoledì", "3": "giovedì", "4": "venerdì", "5": "sabato", "6": "domenica" }, "date": { "1": "primo", "2": "due", "3": "tre", "4": "quattro", "5": "cinque", "6": "sei", "7": "sette", "8": "otto", "9": "nove", "10": "dieci", "11": "undici", "12": "dodici", "13": "tredici", "14": "quattordici", "15": "quindici", "16": "sedici", "17": "diciassette", "18": "diciotto", "19": "diciannove", "20": "venti", "21": "ventuno", "22": "ventidue", "23": "ventitre", "24": "ventiquattro", "25": "venticinque", "26": "ventisei", "27": "ventisette", "28": "ventotto", "29": "ventinove", "30": "trenta", "31": "trentuno" }, "month": { "1": "gennaio", "2": "febbraio", "3": "marzo", "4": "aprile", "5": "maggio", "6": "giugno", "7": "luglio", "8": "agosto", "9": "settembre", "10": "ottobre", "11": "novembre", "12": "dicembre" }, "number": { "0": "zero", "1": "uno", "2": "due", "3": "tre", "4": "quattro", "5": "cinque", "6": "sei", "7": "sette", "8": "otto", "9": "nove", "10": "dieci", "11": "undici", "12": "dodici", "13": "tredici", "14": "quattordici", "15": "quindici", "16": "sedici", "17": 
"diciassette", "18": "diciotto", "19": "diciannove", "20": "venti", "21": "ventuno", "28": "ventotto", "30": "trenta", "31": "trentuno", "38": "trentotto", "40": "quaranta", "41": "quarantuno", "48": "quarantotto", "50": "cinquanta", "51": "cinquantuno", "58": "cinquantotto", "60": "sessanta", "61": "sessantuno", "68": "sessantotto", "70": "settanta", "71": "settantuno", "78": "settantotto", "80": "ottanta", "81": "ottantuno", "88": "ottantotto", "90": "novanta", "91": "novantuno", "98": "novantotto", "100": "cento", "1000": "mille", "2000": "duemila" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/date_time_test.json000066400000000000000000000100611426211343400273100ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "uno a.C." }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dieci a.C." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "novantadue a.C." 
}, "4": {"datetime_param": "100, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "cento" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ottocento undici" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "quattrocento cinquantaquattro" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille e cinque" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille e dodici" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille e quarantasei" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille ottocento sette" }, "11": {"datetime_param": "1700, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille settecento" }, "12": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille settecento diciassette" }, "13": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille novecento ottantotto"}, "14": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila nove"}, "15": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila diciotto"}, "16": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila ventuno"}, "17": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila trenta"}, "18": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duemila cento" }, "19": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mille" }, "20": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duemila" }, "21": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tremila cento venti a.C." }, "22": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tremila duecento quarantuno a.C." 
}, "23": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinquemila duecento" }, "24": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mille cento" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "martedì, trentuno gennaio, duemila diciassette"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "domenica, quattro febbraio, duemila diciotto"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "domenica, quattro febbraio"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "domenica, quattro"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "domani"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "oggi"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "ieri"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "domenica, quattro febbraio"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "domenica, quattro febbraio, duemila diciotto"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "martedì, trentuno gennaio, duemila diciassette alle una e ventidue del pomeriggio"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "martedì, trentuno gennaio, duemila diciassette alle tredici e ventidue"} } } lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/day.word000066400000000000000000000000071426211343400250740ustar00rootroot00000000000000giorno 
lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/days.word000066400000000000000000000000071426211343400252570ustar00rootroot00000000000000giorni lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/hour.word000066400000000000000000000000041426211343400252710ustar00rootroot00000000000000ora lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/hours.word000066400000000000000000000000041426211343400254540ustar00rootroot00000000000000ore lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/minute.word000066400000000000000000000000071426211343400256200ustar00rootroot00000000000000minuto lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/minutes.word000066400000000000000000000000071426211343400260030ustar00rootroot00000000000000minuti lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/second.word000066400000000000000000000000101426211343400255640ustar00rootroot00000000000000secondo lingua-franca-release-v0.4.3/lingua_franca/res/text/it-it/seconds.word000066400000000000000000000000101426211343400257470ustar00rootroot00000000000000secondi lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/000077500000000000000000000000001426211343400234175ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/date_time.json000066400000000000000000000073611426211343400262540ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x} en {x0}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^1\\d{2}$", "format": "honderd"}, "2": {"match": "^\\d{3}$", "format": "{x_in_x00} honderd"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^10\\d\\d$", "format": "duizend"}, "2": {"match": "^\\d0\\d{2}$", "format": "{x_in_x000} duizend"}, "3": {"match": "^1\\d00$", "format": "{xx_in_xx00} honderd"}, "4": {"match": "^\\d{2}00$", "format": 
"{x_in_x00} en {x0_in_x000} honderd"}, "5": {"match": "^\\d0\\d\\d$", "format": "{x_in_x000} duizend"}, "6": {"match": "^1\\d{3}$", "format": "{xx_in_xx00}"}, "7": {"match": "^\\d{4}$", "format": "{x_in_0x00} en {x0_in_x000}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d{1}?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d{2}?$", "format": "{formatted_decade} {bc}"}, "3": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "4": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "5": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "6": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "7": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "8": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "v.c." }, "date_format": { "date_full": "{weekday}, {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day} {month}", "today": "vandaag", "tomorrow": "morgen", "yesterday": "gisteren" }, "time_format": { "time_full": "{minutes} over {hour}" }, "date_time_format": { "date_time": "{formatted_date} om {formatted_time}" }, "weekday": { "0": "maandag", "1": "dinsdag", "2": "woensdag", "3": "donderdag", "4": "vrijdag", "5": "zaterdag", "6": "zondag" }, "date": { "1": "een", "2": "twee", "3": "drie", "4": "vier", "5": "vijf", "6": "zes", "7": "zeven", "8": "acht", "9": "negen", "10": "tien", "11": "elf", "12": "twaalf", "13": "dertien", "14": "veertien", "15": "vijtien", "16": "zestien", "17": "zeventien", "18": "achttien", "19": "negentien", "20": "twintig", "21": "eenentwintig", "22": "tweeentwintig", "23": "drieentwintig", "24": "vierentwintig", "25": "vijfentwintig", "26": "zesentwintig", "27": "zevenentwintig", "28": "achtentwintig", "29": "negenentwintig", "30": "dertig", "31": "eenendertig" }, 
"month": { "1": "januari", "2": "februari", "3": "maart", "4": "april", "5": "mei", "6": "juni", "7": "juli", "8": "augustus", "9": "september", "10": "oktober", "11": "november", "12": "december" }, "number": { "0": "nul", "1": "een", "2": "twee", "3": "drie", "4": "vier", "5": "vijf", "6": "zes", "7": "zeven", "8": "acht", "9": "negen", "10": "tien", "11": "elf", "12": "twaalf", "13": "dertien", "14": "veertien", "15": "vijtien", "16": "zestien", "17": "zeventien", "18": "achttien", "19": "negentien", "20": "twintig", "30": "dertig", "40": "veertig", "50": "vijftig", "60": "zestig", "70": "zeventig", "80": "tachtig", "90": "negentig" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/date_time_test.json000066400000000000000000000102071426211343400273040ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "een v.c." }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tien v.c." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "twee en negentig v.c." 
}, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht honderd drie" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht honderd elf" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vier honderd vier en vijftig" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duizend vijf" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duizend twaalf" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "duizend zes en veertig" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "achttien zeven" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zeventien zeventien" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "negentien acht en tachtig"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend negen"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend achttien"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend een en twintig"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend dertig"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "een en twintig honderd" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "duizend" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "twee duizend" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "een en dertig twintig v.c." }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "twee en dertig een en veertig v.c." 
}, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "twee en vijftig honderd" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elf honderd" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "een en twintig honderd" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "dinsdag, eenendertig januari, twee duizend zeventien"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "zondag, vier februari, twee duizend achttien"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "zondag, vier februari"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "zondag, vier februari"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "morgen"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "vandaag"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "gisteren"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "zondag, vier februari"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "zondag, vier februari, twee duizend achttien"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "dinsdag, eenendertig januari, twee duizend zeventien om tweeentwintig over één 's middags"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "dinsdag, eenendertig januari, twee duizend zeventien om dertien uur tweeentwintig"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/day.word000066400000000000000000000000041426211343400250630ustar00rootroot00000000000000dag lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/days.word000066400000000000000000000000061426211343400252500ustar00rootroot00000000000000dagen lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/hour.word000066400000000000000000000000041426211343400252630ustar00rootroot00000000000000uur lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/hours.word000066400000000000000000000000051426211343400254470ustar00rootroot00000000000000uren lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/minute.word000066400000000000000000000000071426211343400256120ustar00rootroot00000000000000minuut lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/minutes.word000066400000000000000000000000101426211343400257670ustar00rootroot00000000000000minuten lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/second.word000066400000000000000000000000101426211343400255560ustar00rootroot00000000000000seconde lingua-franca-release-v0.4.3/lingua_franca/res/text/nl-nl/seconds.word000066400000000000000000000000111426211343400257420ustar00rootroot00000000000000seconden lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/000077500000000000000000000000001426211343400234235ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/and.word000066400000000000000000000000071426211343400250570ustar00rootroot00000000000000i oraz lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/date_time.json000066400000000000000000000072101426211343400262510ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, "default": 
"{number}" }, "thousand_format": { "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} tysiąc"}, "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundred"}, "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundred"}, "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} oh {formatted_decade} {bc}"}, "7": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "p.n.e." 
}, "date_format": { "date_full": "{weekday}, {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day}", "today": "dziś", "tomorrow": "jutro", "yesterday": "wczoraj" }, "date_time_format": { "date_time": "{formatted_date} at {formatted_time}" }, "weekday": { "0": "poniedziałek", "1": "wtorek", "2": "środa", "3": "czwartek", "4": "piątek", "5": "sobota", "6": "niedziela" }, "date": { "1": "pierwszy", "2": "drugi", "3": "trzeci", "4": "czwarty", "5": "piąty", "6": "szósty", "7": "siódmy", "8": "ósmy", "9": "dziewiąty", "10": "dziesiąty", "11": "jedenast", "12": "dwunasty", "13": "trzynasty", "14": "czternasty", "15": "piętnasty", "16": "szesnasty", "17": "siedemnasty", "18": "osiemnasty", "19": "dziewiętnasty", "20": "dwudziesty", "21": "dwudziesty pierwszy", "22": "dwudziesty drugi", "23": "dwudziesty trzeci", "24": "dwudziesty czwarty", "25": "dwudziesty piąty", "26": "dwudziesty szósty", "27": "dwudziesty siódmy", "28": "dwudziesty ósmy", "29": "dwudziesty dziewiąty", "30": "trzydziesty", "31": "trzydziesty pierwszy" }, "month": { "1": "styczeń", "2": "luty", "3": "marzec", "4": "kwiecień", "5": "maj", "6": "czerwiec", "7": "lipiec", "8": "sierpień", "9": "wrzesień", "10": "październik", "11": "listopad", "12": "grudzień" }, "number": { "0": "zero", "1": "jeden", "2": "dwa", "3": "trzy", "4": "cztery", "5": "pięć", "6": "sześć", "7": "siedem", "8": "osiem", "9": "dziewięc", "10": "dziesięć", "11": "jedenaście", "12": "dwanaście", "13": "trzynaście", "14": "czternaście", "15": "piętnaście", "16": "szesnaście", "17": "siedemnaście", "18": "osiemnaście", "19": "dziewiętnaście", "20": "dwadzieścia", "30": "trzydzieści", "40": "czterdzieści", "50": "pięćdziesiąt", "60": "sześćdziesiąt", "70": "siedemdziesiąt", "80": "osiemdziesiąt", "90": "dziewięćdziesiąt" } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/day.word000066400000000000000000000000071426211343400250720ustar00rootroot00000000000000dzień lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/days.word000066400000000000000000000000041426211343400252520ustar00rootroot00000000000000dni lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/hour.word000066400000000000000000000000101426211343400252640ustar00rootroot00000000000000godzina lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/hours.word000066400000000000000000000000071426211343400254550ustar00rootroot00000000000000godzin lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/minute.word000066400000000000000000000000071426211343400256160ustar00rootroot00000000000000minuta lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/minutes.word000066400000000000000000000000061426211343400260000ustar00rootroot00000000000000minut lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/or.word000066400000000000000000000000111426211343400247300ustar00rootroot00000000000000lub albo lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/second.word000066400000000000000000000000101426211343400255620ustar00rootroot00000000000000sekunda lingua-franca-release-v0.4.3/lingua_franca/res/text/pl-pl/seconds.word000066400000000000000000000000071426211343400257530ustar00rootroot00000000000000sekund lingua-franca-release-v0.4.3/lingua_franca/res/text/pt-pt/000077500000000000000000000000001426211343400234435ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/pt-pt/normalize.json000066400000000000000000000033271426211343400263430ustar00rootroot00000000000000{ "lowercase": false, "numbers_to_digits": true, "expand_contractions": false, "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": true, "contractions": {}, "word_replacements": {}, "number_replacements": { "catorze": "14", "cem": "100", "cento": "100", "cinco": "5", 
"cinquenta": "50", "dez": "10", "dezanove": "19", "dezasseis": "16", "dezassete": "17", "dezoito": "18", "dois": "2", "doze": "12", "duas": "2", "duzentas": "200", "duzentos": "200", "mil": "1000", "milhão": "1000000", "nove": "9", "novecentas": "900", "novecentos": "900", "noventa": "90", "oitenta": "80", "oito": "8", "oitocentas": "800", "oitocentos": "800", "onze": "11", "primeiro": "1", "quarenta": "40", "quatro": "4", "quatrocentas": "400", "quatrocentos": "400", "quinhentas": "500", "quinhentos": "500", "quinze": "15", "segundo": "2", "seis": "6", "seiscentas": "600", "seiscentos": "600", "sessenta": "60", "sete": "7", "setecentas": "700", "setecentos": "700", "setenta": "70", "terceiro": "3", "tres": "3", "treze": "13", "trezentas": "300", "trezentos": "300", "trinta": "30", "três": "3", "um": "1", "uma": "1", "vinte": "20", "zero": "0" }, "stopwords": [ "de", "dos", "das", "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", "este", "esta", "deste", "desta", "neste", "nesta", "nesse", "nessa", "foi", "que" ], "articles": [ "o", "a", "os", "as" ] }lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/000077500000000000000000000000001426211343400234515ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/date_time.json000066400000000000000000000130271426211343400263020ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^1\\d{2}$", "format": "сто"}, "2": {"match": "^2\\d{2}$", "format": "двести"}, "3": {"match": "^[34]\\d{2}$", "format": "{x_in_x00}ста"}, "4": {"match": "^\\d{3}$", "format": "{x_in_x00}сот"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^10\\d{2}$", "format": "тысяча"}, "2": {"match": "^11\\d{2}$", "format": "тысяча сто"}, "3": {"match": 
"^12\\d{2}$", "format": "тысяча двести"}, "4": {"match": "^1[34]\\d{2}$", "format": "тысяча {x_in_x00}ста"}, "5": {"match": "^1\\d{3}$", "format": "тысяча {x_in_x00}сот"}, "6": {"match": "^20\\d{2}$", "format": "две тысячи"}, "7": {"match": "^21\\d{2}$", "format": "две тысячи сто"}, "8": {"match": "^22\\d{2}$", "format": "две тысячи двести"}, "9": {"match": "^2[34]\\d{2}$", "format": "две тысячи {x_in_x00}ста"}, "10": {"match": "^2\\d{3}$", "format": "две тысячи {x_in_x00}сот"}, "11": {"match": "^[34]0\\d{2}$", "format": "{x_in_x000} тысячи"}, "12": {"match": "^[34]1\\d{2}$", "format": "{x_in_x000} тысячи сто"}, "13": {"match": "^[34]2\\d{2}$", "format": "{x_in_x000} тысячи двести"}, "14": {"match": "^[34][34]\\d{2}$", "format": "{x_in_x000} тысячи {x_in_x00}ста"}, "15": {"match": "^[34]\\d{3}$", "format": "{x_in_x000} тысячи {x_in_x00}сот"}, "16": {"match": "^[5-9]0\\d{2}$", "format": "{x_in_x000} тысяч"}, "17": {"match": "^[5-9]1\\d{2}$", "format": "{x_in_x000} тысяч сто"}, "18": {"match": "^[5-9]2\\d{2}$", "format": "{x_in_x000} тысяч двести"}, "19": {"match": "^[5-9][34]\\d{2}$", "format": "{x_in_x000} тысяч {x_in_x00}ста"}, "20": {"match": "^[5-9]\\d{3}$", "format": "{x_in_x000} тысяч {x_in_x00}сот"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "до нашей эры" }, "date_format": { "date_full": "{weekday}, {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day}", "today": "сегодня", "tomorrow": "завтра", "yesterday": "вчера" }, "date_time_format": { "date_time": "{formatted_date} в 
{formatted_time}" }, "weekday": { "0": "в понедельник", "1": "во вторник", "2": "в среду", "3": "в четверг", "4": "в пятницу", "5": "в субботу", "6": "в воскресенье" }, "date": { "1": "первого", "2": "второго", "3": "третьего", "4": "четвёртого", "5": "пятого", "6": "шестого", "7": "седьмого", "8": "восьмого", "9": "девятого", "10": "десятого", "11": "одиннадцатого", "12": "двенадцатого", "13": "тринадцатого", "14": "четырнадцатого", "15": "пятнадцатого", "16": "шестнадцатого", "17": "семнадцатого", "18": "восемнадцатого", "19": "девятнадцатого", "20": "двадцатого", "21": "двадцать первого", "22": "двадцать второго", "23": "двадцать третьего", "24": "двадцать четвёртого", "25": "двадцать пятого", "26": "двадцать шестого", "27": "двадцать седьмого", "28": "двадцать восьмого", "29": "двадцать девятого", "30": "тридцатого", "31": "тридцать первого" }, "month": { "1": "января", "2": "февраля", "3": "марта", "4": "апреля", "5": "мая", "6": "июня", "7": "июля", "8": "августа", "9": "сентября", "10": "октября", "11": "ноября", "12": "декабря" }, "number": { "0": "ноль", "1": "один", "2": "два", "3": "три", "4": "четыре", "5": "пять", "6": "шесть", "7": "семь", "8": "восемь", "9": "девять", "10": "десять", "11": "одиннадцать", "12": "двенадцать", "13": "тринадцать", "14": "четырнадцать", "15": "пятнадцать", "16": "шестнадцать", "17": "семнадцать", "18": "восемнадцать", "19": "девятнадцать", "20": "двадцать", "30": "тридцать", "40": "сорок", "50": "пятьдесят", "60": "шестьдесят", "70": "семьдесят", "80": "восемьдесят", "90": "девяносто" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/date_time_test.json000066400000000000000000000120061426211343400273350ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "один до нашей эры" }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "десять до нашей эры" }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", 
"assertEqual": "девяносто два до нашей эры" }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "восемьсот три" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "восемьсот одиннадцать" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "четыреста пятьдесят четыре" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча пять" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча двенадцать" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча сорок шесть" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча восемьсот семь" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча семьсот семнадцать" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча девятьсот восемьдесят восемь"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи девять"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи восемнадцать"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи двадцать один"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи тридцать"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "две тысячи сто" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "тысяча" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "две тысячи" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "три тысячи сто двадцать до нашей эры" }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "три тысячи двести сорок один до нашей эры" }, "22": {"datetime_param": "5200, 1, 
31, 13, 22, 3", "bc": "False", "assertEqual": "пять тысяч двести" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "тысяча сто" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "две тысячи сто" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "во вторник, тридцать первого января, две тысячи семнадцать"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля, две тысячи восемнадцать"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "завтра"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "сегодня"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "вчера"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "в воскресенье, четвёртого февраля, две тысячи восемнадцать"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "во вторник, тридцать первого января, две тысячи семнадцать в час двадцать два дня"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "во вторник, тридцать первого января, две тысячи семнадцать в тринадцать двадцать два"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/day.word000066400000000000000000000000111426211343400251130ustar00rootroot00000000000000день lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/days.word000066400000000000000000000000111426211343400252760ustar00rootroot00000000000000дней lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/hour.word000066400000000000000000000000071426211343400253200ustar00rootroot00000000000000час lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/hours.word000066400000000000000000000000131426211343400255000ustar00rootroot00000000000000часов lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/minute.word000066400000000000000000000000151426211343400256430ustar00rootroot00000000000000минута lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/minutes.word000066400000000000000000000000131426211343400260240ustar00rootroot00000000000000минут lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/normalize.json000066400000000000000000000022471426211343400263510ustar00rootroot00000000000000{ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, "remove_symbols": false, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, "contractions": {}, "word_replacements": {}, "number_replacements": { "ноль": "0", "нуль": "0", "один": "1", "одна": "1", "два": "2", "две": "2", "три": "3", "четыре": "4", "пять": "5", "шесть": "6", "семь": "7", "восемь": "8", "девять": "9", "десять": "10", "одиннадцать": "11", "двенадцать": "12", "тринадцать": "13", "четырнадцать": "14", "пятнадцать": "15", "шестнадцать": "16", "семнадцать": "17", "восемнадцать": "18", "девятнадцать": "19", "двадцать": "20", "тридцать": "30", "сорок": "40", "пятьдесят": "50", "шестьдесят": "60", "семьдесят": "70", "восемьдесят": "80", "девяносто": "90" }, "stopwords": [], "articles": [] 
}lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/second.word000066400000000000000000000000171426211343400256170ustar00rootroot00000000000000секунда lingua-franca-release-v0.4.3/lingua_franca/res/text/ru-ru/seconds.word000066400000000000000000000000151426211343400260000ustar00rootroot00000000000000секунд lingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/000077500000000000000000000000001426211343400234265ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/and.word000066400000000000000000000000021426211343400250550ustar00rootroot00000000000000inlingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/date_time.json000066400000000000000000000060001426211343400262500ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^0$", "format": ""}, "2": {"match": "^\\d$", "format": "{x}"}, "3": {"match": "^1\\d$", "format": "{xx}"}, "4": {"match": "^\\d0$", "format": "{x0}"}, "5": {"match": "^[2-9]\\d$", "format": "{x}in{x0}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{1,2}$", "format": "{formatted_decade}"}, "2": {"match": "^1\\d{2}$", "format": "sto {formatted_decade}"}, "3": {"match": "^2\\d{2}$", "format": "dvesto {formatted_decade}"}, "4": {"match": "^\\d{3}$", "format": "{x_in_x00}sto {formatted_decade}"}, "default": "{formatted_decade}" }, "thousand_format": { "1": {"match": "^\\d{1,3}$", "format": "{formatted_hundreds}"}, "2": {"match": "^1\\d{3}$", "format": "tisoč {formatted_hundreds}"}, "default": "{x_in_x000} tisoč {formatted_hundreds}" }, "year_format": { "default": "{formatted_thousand} {bc}", "bc": "pr. n. št." 
}, "date_format": { "date_full": "{weekday}, {day} {month} {formatted_year}", "date_full_no_year": "{weekday}, {day} {month}", "date_full_no_year_month": "{weekday}, {day}", "today": "danes", "tomorrow": "jutri", "yesterday": "včeraj" }, "date_time_format": { "date_time": "{formatted_date}, ob {formatted_time}" }, "weekday": { "0": "ponedeljek", "1": "torek", "2": "sreda", "3": "četrtek", "4": "petek", "5": "sobota", "6": "nedelja" }, "date": { "1": "prvi", "2": "drugi", "3": "tretji", "4": "četrti", "5": "peti", "6": "šesti", "7": "sedmi", "8": "osmi", "9": "deveti", "10": "deseti", "11": "enajsti", "12": "dvanajsti", "13": "trinajsti", "14": "štirinajsti", "15": "petjanjsti", "16": "šestnajsti", "17": "sedemnajsti", "18": "osemnajsti", "19": "devetnajsti", "20": "dvajseti", "21": "enaindvajseti", "22": "dvaindvajseti", "23": "triindvajseti", "24": "štiriindvajseti", "25": "petindvajseti", "26": "šestindvajseti", "27": "sedemindvajseti", "28": "osemindvajseti", "29": "devetindvajseti", "30": "trideseti", "31": "enaintrideseti" }, "month": { "1": "januar", "2": "februar", "3": "marec", "4": "april", "5": "maj", "6": "junij", "7": "julij", "8": "avgust", "9": "september", "10": "oktober", "11": "november", "12": "december" }, "number": { "0": "nič", "1": "ena", "2": "dva", "3": "tri", "4": "štiri", "5": "pet", "6": "šest", "7": "sedem", "8": "osem", "9": "devet", "10": "deset", "11": "enajst", "12": "dvanajst", "13": "trinajst", "14": "štirinajst", "15": "petnajst", "16": "šestnajst", "17": "sedemnajst", "18": "osemnajst", "19": "devetnajst", "20": "dvajset", "30": "trideset", "40": "štirideset", "50": "petdeset", "60": "šestdeset", "70": "sedemdeset", "80": "osemdeset", "90": "devetdeset" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/date_time_test.json000066400000000000000000000102101426211343400273050ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ena pr. n. št." 
}, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "deset pr. n. št." }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dvaindevetdeset pr. n. št." }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osemsto tri" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "osemsto enajst" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "štiristo štiriinpetdeset" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč pet" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč dvanajst" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč šestinštirideset" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč osemsto sedem" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč sedemsto sedemnajst" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč devetsto oseminosemdeset"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč devet"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč osemnajst"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč enaindvajset"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč trideset"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dva tisoč sto" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tisoč" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dva tisoč" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tri tisoč sto dvajset pr. n. št." 
}, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tri tisoč dvesto enainštirideset pr. n. št." }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "pet tisoč dvesto" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tisoč sto" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dva tisoč sto" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "torek, enaintrideseti januar dva tisoč sedemnajst"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "nedelja, četrti februar dva tisoč osemnajst"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "nedelja, četrti februar"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "nedelja, četrti"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "jutri"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "danes"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "včeraj"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "nedelja, četrti februar"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "nedelja, četrti februar dva tisoč osemnajst"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "torek, enaintrideseti januar dva tisoč sedemnajst, ob dvaindvajset čez ena p.m."}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "torek, enaintrideseti januar dva tisoč sedemnajst, ob trinajst dvaindvajset"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/day.word000066400000000000000000000000031426211343400250710ustar00rootroot00000000000000danlingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/days.word000066400000000000000000000000031426211343400252540ustar00rootroot00000000000000dnilingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/hour.word000066400000000000000000000000031426211343400252710ustar00rootroot00000000000000uralingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/hours.word000066400000000000000000000000021426211343400254530ustar00rootroot00000000000000urlingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/minute.word000066400000000000000000000000061426211343400256200ustar00rootroot00000000000000minutalingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/minutes.word000066400000000000000000000000051426211343400260020ustar00rootroot00000000000000minutlingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/normalize.json000066400000000000000000000016161426211343400263250ustar00rootroot00000000000000{ "lowercase": false, "numbers_to_digits": true, "expand_contractions": false, "remove_symbols": false, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, "contractions": {}, "word_replacements": {}, "number_replacements": { "nič": "0", "ena": "1", "dve": "2", "dva": "2", "tri": "3", "štiri": "4", "pet": "5", "šest": "6", "sedem": "7", "osem": "8", "devet": "9", "deset": "10", "enajst": "11", "dvanajst": "12", "trinajst": "13", "štirinajst": "14", "petnajst": "15", "šestnajst": "16", "sedemnajst": "17", "osemnajst": "18", "devetnajst": "19", "dvajset": "20", "trideset": "30", "štirideset": "40", "petdeset": "50", "šestdeset": "60", "sedemdeset": "70", "osemdeset": "80", "devetdeset": "90" }, "stopwords": [], "articles": [] 
}lingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/or.word000066400000000000000000000000031426211343400247340ustar00rootroot00000000000000alilingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/second.word000066400000000000000000000000071426211343400255730ustar00rootroot00000000000000sekundalingua-franca-release-v0.4.3/lingua_franca/res/text/sl-si/seconds.word000066400000000000000000000000061426211343400257550ustar00rootroot00000000000000sekundlingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/000077500000000000000000000000001426211343400234345ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/date_time.json000066400000000000000000000066151426211343400262720ustar00rootroot00000000000000{ "decade_format": { "1": {"match": "^\\d$", "format": "{x}"}, "2": {"match": "^1\\d$", "format": "{xx}"}, "3": {"match": "^\\d0$", "format": "{x0}"}, "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, "default": "{number}" }, "hundreds_format": { "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundra"}, "default": "{number}" }, "thousand_format": { "1": {"match": "^\\d00\\d$", "format": "{x_in_x000} tusen"}, "2": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundra"}, "3": {"match": "^\\d{2}00$", "format": "{x0_in_x000} {x_in_x00} hundra"}, "4": {"match": "^(1\\d{3})|(\\d0\\d{2})$", "format": "{xx_in_xx00}"}, "5": {"match": "^\\d{4}$", "format": "{x0_in_x000} {x_in_x00}"}, "default": "{number}" }, "year_format": { "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, "5": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, "6": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} noll {formatted_decade} {bc}"}, "7": {"match": "^\\d{4}$", "format": 
"{formatted_thousand} {formatted_decade} {bc}"}, "default": "{year} {bc}", "bc": "före kristus" }, "date_format": { "date_full": "{weekday}, den {day} {month}, {formatted_year}", "date_full_no_year": "{weekday}, den {day} {month}", "date_full_no_year_month": "{weekday}, den {day}", "today": "idag", "tomorrow": "imorgon", "yesterday": "igår" }, "date_time_format": { "date_time": "{formatted_date} klockan {formatted_time}" }, "weekday": { "0": "måndag", "1": "tisdag", "2": "onsdag", "3": "torsdag", "4": "fredag", "5": "lördag", "6": "söndag" }, "date": { "1": "första", "2": "andra", "3": "tredje", "4": "fjärde", "5": "femte", "6": "sjätte", "7": "sjunde", "8": "åttonde", "9": "nionde", "10": "tionde", "11": "elfte", "12": "tolfte", "13": "trettonde", "14": "fjortonde", "15": "femtonde", "16": "sextonde", "17": "sjuttonde", "18": "artonde", "19": "nittonde", "20": "tjugonde", "21": "tjugoförsta", "22": "tjugoandra", "23": "tjugotredje", "24": "tjugofjärde", "25": "tjugofemte", "26": "tjugosjätte", "27": "tjugosjunde", "28": "tjugoåttonde", "29": "tjugonionde", "30": "trettionde", "31": "trettiförsta" }, "month": { "1": "januari", "2": "februari", "3": "mars", "4": "april", "5": "maj", "6": "juni", "7": "juli", "8": "augusti", "9": "september", "10": "oktober", "11": "november", "12": "december" }, "number": { "0": "noll", "1": "ett", "2": "två", "3": "tre", "4": "fyra", "5": "fem", "6": "sex", "7": "sju", "8": "åtta", "9": "nio", "10": "tio", "11": "elva", "12": "tolv", "13": "tretton", "14": "fjorton", "15": "femton", "16": "sexton", "17": "sjutton", "18": "arton", "19": "nitton", "20": "tjugo", "30": "trettio", "40": "förtio", "50": "femtio", "60": "sextio", "70": "sjuttio", "80": "åttio", "90": "nittio" } } lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/date_time_test.json000066400000000000000000000101371426211343400273230ustar00rootroot00000000000000{ "test_nice_year": { "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ett 
före kristus" }, "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tio före kristus" }, "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "nittio två före kristus" }, "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "åtta hundra tre" }, "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "åtta hundra elva" }, "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "fyra hundra femtio fyra" }, "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ett tusen fem" }, "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tio tolv" }, "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tio förtio sex" }, "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "arton noll sju" }, "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "sjutton sjutton" }, "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "nitton åttio åtta"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "två tusen nio"}, "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tjugo arton"}, "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tjugo tjugo ett"}, "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tjugo trettio"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tjugo ett hundra" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ett tusen" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "två tusen" }, "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "trettio ett tjugo före kristus" }, "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", 
"assertEqual": "trettio två förtio ett före kristus" }, "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "femtio två hundra" }, "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elva hundra" }, "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tjugo ett hundra" } }, "test_nice_date": { "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "tisdag, den trettiförsta januari, tjugo sjutton"}, "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "söndag, den fjärde februari, tjugo arton"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "söndag, den fjärde februari"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "söndag, den fjärde"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "imorgon"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "idag"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "igår"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "söndag, den fjärde februari"}, "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "söndag, den fjärde februari, tjugo arton"} }, "test_nice_date_time": { "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "tisdag, den trettiförsta januari, tjugo sjutton klockan tjugotvå minuter över ett på eftermiddagen"}, "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "tisdag, den trettiförsta januari, tjugo sjutton klockan tretton tjugotvå"} } } 
lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/day.word000066400000000000000000000000041426211343400251000ustar00rootroot00000000000000dag lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/days.word000066400000000000000000000000061426211343400252650ustar00rootroot00000000000000dagar lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/hour.word000066400000000000000000000000061426211343400253020ustar00rootroot00000000000000timme lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/hours.word000066400000000000000000000000071426211343400254660ustar00rootroot00000000000000timmar lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/minute.word000066400000000000000000000000061426211343400256260ustar00rootroot00000000000000minut lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/minutes.word000066400000000000000000000000101426211343400260040ustar00rootroot00000000000000minuter lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/second.word000066400000000000000000000000071426211343400256010ustar00rootroot00000000000000sekund lingua-franca-release-v0.4.3/lingua_franca/res/text/sv-se/seconds.word000066400000000000000000000000111426211343400257570ustar00rootroot00000000000000sekunder lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/000077500000000000000000000000001426211343400234475ustar00rootroot00000000000000lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/day.word000066400000000000000000000000051426211343400251140ustar00rootroot00000000000000gün lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/days.word000066400000000000000000000000101426211343400252730ustar00rootroot00000000000000günler lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/hour.word000066400000000000000000000000051426211343400253140ustar00rootroot00000000000000saat lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/hours.word000066400000000000000000000000101426211343400254730ustar00rootroot00000000000000saatler 
lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/minute.word000066400000000000000000000000071426211343400256420ustar00rootroot00000000000000dakika lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/minutes.word000066400000000000000000000000121426211343400260210ustar00rootroot00000000000000dakikalar lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/second.word000066400000000000000000000000071426211343400256140ustar00rootroot00000000000000saniye lingua-franca-release-v0.4.3/lingua_franca/res/text/tr-tr/seconds.word000066400000000000000000000000121426211343400257730ustar00rootroot00000000000000saniyeler lingua-franca-release-v0.4.3/lingua_franca/time.py000066400000000000000000000044161426211343400221370ustar00rootroot00000000000000# # Copyright 2018 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# from datetime import datetime from dateutil.tz import gettz, tzlocal __default_tz = None def set_default_tz(tz): global __default_tz if isinstance(tz, str): tz = gettz(tz) __default_tz = tz def default_timezone(): """ Get the default timezone either a value set by downstream user with lingua_franca.internal.set_default_tz or default system value Returns: (datetime.tzinfo): Definition of the default timezone """ return __default_tz or tzlocal() def now_utc(): """ Retrieve the current time in UTC Returns: (datetime): The current time in Universal Time, aka GMT """ return to_utc(datetime.utcnow()) def now_local(tz=None): """ Retrieve the current time Args: tz (datetime.tzinfo, optional): Timezone, default to user's settings Returns: (datetime): The current time """ if not tz: tz = default_timezone() return datetime.now(tz) def to_utc(dt): """ Convert a datetime with timezone info to a UTC datetime Args: dt (datetime): A datetime (presumably in some local zone) Returns: (datetime): time converted to UTC """ tzUTC = gettz("UTC") if dt.tzinfo: return dt.astimezone(tzUTC) else: return dt.replace(tzinfo=gettz("UTC")).astimezone(tzUTC) def to_local(dt): """ Convert a datetime to the user's local timezone Args: dt (datetime): A datetime (if no timezone, defaults to UTC) Returns: (datetime): time converted to the local timezone """ tz = default_timezone() if dt.tzinfo: return dt.astimezone(tz) else: return dt.replace(tzinfo=gettz("UTC")).astimezone(tz) lingua-franca-release-v0.4.3/pre-spinoff-credits.md000066400000000000000000000044731426211343400222460ustar00rootroot00000000000000# List of Original Contributors Lingua Franca started as a spinoff from Mycroft-core (not a fork.) Consequently, the source repository no longer lists those who contributed to this code before the spinoff. Below is an attempt to list them. Some names may be missing. Some may be listed twice by mistake, especially if they've committed code under more than one name. 
Names are as they appeared in `git log`, or as they're familiar to the Mycroft community, and are in no particular order. If you notice a name is missing (especially your own!) please notify the LF maintainers. This list was created by naively parsing Mycroft-core's `git log`, and we may have missed a spot. Some notes from @penrods: > \ > > For history's sake, here are a few tidbits: > > * This grew from format.py and parse.py, originally in mycroft.util > * history of [parse.py](https://github.com/MycroftAI/mycroft-core/commits/release/v19.2.4?after=d0f55186cb4fbdf6b2fd18b218e004568124516f+34&branch=release%2Fv19.2.4&path%5B%5D=mycroft&path%5B%5D=util&path%5B%5D=parse.py) > * history of [format.py](https://github.com/MycroftAI/mycroft-core/commits/release/v19.2.4?after=d0f55186cb4fbdf6b2fd18b218e004568124516f+34&branch=release%2Fv19.2.4&path%5B%5D=mycroft&path%5B%5D=util&path%5B%5D=format.py) > * First real community contributions came during a sprint at PyCon 2017 in Portland. Several people (especially @ashwinjv and @ProsperousHeart) converted some PHP utilities for parsing/formatting I'd written as part of Christopher. > * I'd always intended this to be generic, but I believe it was @JarbasAl who really lead the push to separate it to a fully independent library. > > \ @ChanceNCounter's note: Penrod's right, it was Jarbas who got the fork off the ground. That's part of why he's credited with so many thousands of lines in this repo. The other reason is because he's written many thousands of lines in this repo. --- penrods Jarbas Ale Åke Forslund Andreas Lorensen Angel Docampo ashwinjv Augusto Monteiro Carsten Agerskov Cakeh ChanceNCounter Chris Rogers Connor Penrod c0r73x danielwine f-e-l-i-x gras64 G3RB3N Kévin C Matthew D. 
Scholefield maxbachmann Michalng Michael Nguyen Mike Woudenberg ProsperousHeart Silvia O'Dwyer SoloVeniaASaludar Thomas Doczkal lingua-franca-release-v0.4.3/project-structure.md000066400000000000000000000104131426211343400220560ustar00rootroot00000000000000# Project Structure and Notes ## Source code layout * user-facing functions live here lingua_franca/ ├─ __init__.py * (exposes certain internal functions) ├─ format.py * ├─ internal.py ├─ time.py * ├─ parse.py * ├─ lang/ (localized functions and basic language data) │ ├─ common_data_<>.py (data structures related to language '<>') │ ├─ format_<>.py (localized formatters) │ ├─ parse_<>.py (localized parsers) ├─ res/ (fully localized data, 'en-us' vs 'en-au' and etc.) │ ├─ text/ │ │ ├─ / │ │ │ ├─ date_time.json │ │ │ ├─ common words ---- ## Adding new languages Ensure that all supported languages are registered in `lingua_franca.internal.py`, in the list `_SUPPORTED_LANGUAGES`. ## Localizing functions If you are localizing an existing top-level function, there is no need to alter the top-level module to which your function belongs. Lingua Franca will discover all localized versions of its top-level functions. Localized functions live in `lingua_franca/lang/`, in files named for their corresponding module. >For example, the top level formatting module is `lingua_franca.format`, and lives at `lingua_franca/format.py`. >English formatters live in `lingua_franca/lang/format_en.py`. >Spanish formatters live in `lingua_franca/lang/format_es.py`. >Spanish *parsers*, corresponding to `lingua_franca.parse` and `lingua_franca/parse.py`, >live in `lingua_franca/lang/parse_es.py`. Note that these use a *primary* language code, such as `en` or `es`, rather than a *full* language code, such as `en-US` or `es-ES`. Details relating to regional dialects reside in `res`. 
Lingua Franca will find your function by itself, as long as - Your files are named properly - Your function and its signature are named and organized properly (described below) and - Your primary language code is registered as a supported language with Lingua Franca itself, in `lingua_franca.internal._SUPPORTED_LANGUAGES` What you must do: - Implement the function with its uniform name, using the appropriate language code. - `lingua_franca.lang.format_en.pronounce_number_en` - `lingua_franca.lang.format_es.pronounce_number_es` - `lingua_franca.lang.format_pt.pronounce_number_pt` - Name function arguments exactly as they are named in the top-level modules - You do not need to implement all arguments, but you must name them identically - All arguments must be keyword arguments (except the primary arguments) - If you need to add new arguments, feel free, but MAKE SURE you add the argument to the top-level function, as a keyword arg. This is the only time you should need to modify the top-level functions while localizing. Ensure that any new arguments are at the end of the function signatures, both in the top-level function, and in your localized function. ## Adding new functions Ensure that all functions which will have localized versions are registered in their module's `_REGISTERED_FUNCTIONS` tuple, conventionally defined near the top. For example, formatters which have been or will be localized are registered in `lingua_franca.format._REGISTERED_FUNCTIONS`, by name only. As of July, 2020, this tuple looks as follows: ```python3 # lingua_franca/format.py _REGISTERED_FUNCTIONS = ("nice_number", "nice_time", "pronounce_number", "nice_response") ``` All top-level functions which have localized versions are wrapped using `lingua_franca.internal.localized_function`, like so: @localized_function def foo(bar, baz): pass Note that this function is a pass-through, and would not be executed even if it had logic of its own. 
The wrapper can fall back on the wrapped function if it is passed an error or errors as triggers: @localized_function(run_own_code_on=[ValueError, IndexError]) def func(bar, baz): print("Something happened!") In the above example, calling `func(x, y)` will usually find and call a localized version of `func`. However, if during that process something raises a `ValueError` or `IndexError`, then (and only then) it will execute `func` itself, and print "Something happened!"lingua-franca-release-v0.4.3/readme.md000066400000000000000000000377231426211343400176240ustar00rootroot00000000000000[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE.md) [![CLA](https://img.shields.io/badge/CLA%3F-Required-blue.svg)](https://mycroft.ai/cla) [![Team](https://img.shields.io/badge/Team-Languages-violetblue.svg)](https://github.com/MycroftAI/contributors/blob/master/team/Languages.md) ![Status](https://img.shields.io/badge/-Alpha-orange.svg) [![Build Status](https://travis-ci.org/MycroftAI/lingua-franca.svg?branch=master)](https://travis-ci.org/MycroftAI/lingua-franca) [![Coverage Status](https://coveralls.io/repos/github/MycroftAI/lingua-franca/badge.svg?branch=master)](https://coveralls.io/github/MycroftAI/lingua-franca?branch=master) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](http://makeapullrequest.com) [![Join chat](https://img.shields.io/badge/Mattermost-join_chat-brightgreen.svg)](https://chat.mycroft.ai/community/channels/languages) # Lingua Franca Mycroft's multilingual text parsing and formatting library Lingua Franca (_noun_)
> a framework that is adopted as the common language between speakers with different native tongues - [Lingua Franca](#lingua-franca) - [Formatting](#formatting) - [Pronounce numbers](#pronounce-numbers) - [Pronounce datetime objects](#pronounce-datetime-objects) - [Pronounce durations](#pronounce-durations) - [Parsing](#parsing) - [Extract numbers](#extract-numbers) - [Extract durations](#extract-durations) - [Extract dates](#extract-dates) - [Contributing to this project](#contributing-to-this-project) - [0. Sign a Contributor Licensing Agreement](#0-sign-a-contributor-licensing-agreement) - [1. Setup a local copy of the project](#1-setup-a-local-copy-of-the-project) - [2. Writing tests](#2-writing-tests) - [3. Run tests to confirm they fail](#3-run-tests-to-confirm-they-fail) - [4. Write code](#4-write-code) - [5. Document your code](#5-document-your-code) - [6. Try it in Mycroft](#6-try-it-in-mycroft) - [7. Commit changes](#7-commit-changes) - [8. Submit a PR](#8-submit-a-pr) - [9. Waiting for a review](#9-waiting-for-a-review) - [Credits](#credits) ## Formatting Convert data into spoken equivalents ### Pronounce numbers spoken versions of numbers ```python from lingua_franca.format import nice_number, pronounce_number assert nice_number(25/6) == "4 and a sixth" assert nice_number(201) == "201" assert nice_number(3.14159269) == "3 and a seventh" assert pronounce_number(3.14159269) == "three point one four" assert pronounce_number(0) == "zero" assert pronounce_number(10) == "ten" assert pronounce_number(201) == "two hundred and one" assert pronounce_number(102.3) == "one hundred and two point three" assert pronounce_number( 4092949192) == "four billion, ninety two million, nine hundred and forty nine thousand, one hundred and ninety two" assert pronounce_number(100034000000299792458, short_scale=True) == \ "one hundred quintillion, thirty four quadrillion, " \ "two hundred and ninety nine million, seven hundred and ninety " \ "two thousand, four hundred and 
fifty eight" assert pronounce_number(100034000000299792458, short_scale=False) == \ "one hundred trillion, thirty four thousand billion, " \ "two hundred and ninety nine million, seven hundred and ninety " \ "two thousand, four hundred and fifty eight" ``` ### Pronounce datetime objects spoken date for datetime.datetime objects ```python from lingua_franca.format import nice_date, nice_date_time, nice_time import datetime dt = datetime.datetime(2017, 1, 31, 13, 22, 3) assert nice_date(dt) == "tuesday, january thirty-first, twenty seventeen" assert nice_time(dt) == "one twenty two" assert nice_time(dt, use_ampm=True) == "one twenty two p.m." assert nice_time(dt, speech=False) == "1:22" assert nice_time(dt, speech=False, use_ampm=True) == "1:22 PM" assert nice_time(dt, speech=False, use_24hour=True) == "13:22" assert nice_time(dt, speech=False, use_24hour=True, use_ampm=True) == "13:22" assert nice_time(dt, use_24hour=True, use_ampm=True) == "thirteen twenty two" assert nice_time(dt, use_24hour=True, use_ampm=False) == "thirteen twenty two" assert nice_date_time(dt) == "tuesday, january thirty-first, twenty seventeen at one twenty two" ``` ### Pronounce durations spoken number of seconds or datetime.timedelta objects ```python from lingua_franca.format import nice_duration assert nice_duration(1) == "one second" assert nice_duration(3) == "three seconds" assert nice_duration(1, speech=False) == "0:01" assert nice_duration(61), "one minute one second" assert nice_duration(61, speech=False) == "1:01" assert nice_duration(5000) == "one hour twenty three minutes twenty seconds" assert nice_duration(5000, speech=False), "1:23:20" assert nice_duration(50000) == "thirteen hours fifty three minutes twenty seconds" assert nice_duration(50000, speech=False) == "13:53:20" assert nice_duration(500000) == "five days eighteen hours fifty three minutes twenty seconds" assert nice_duration(500000, speech=False), "5d 18:53:20" from datetime import timedelta assert 
nice_duration(timedelta(seconds=500000), speech=False) == "5d 18:53:20" ``` ## Parsing Extract data from natural language text ### Extract numbers ```python from lingua_franca.parse import extract_number, extract_numbers # extract a number assert extract_number("nothing") is False assert extract_number("two million five hundred thousand tons of spinning " "metal") == 2500000 assert extract_number("six trillion") == 6000000000000.0 assert extract_number("six trillion", short_scale=False) == 6e+18 assert extract_number("1 and 3/4 cups") == 1.75 assert extract_number("1 cup and a half") == 1.5 ## extracts all numbers assert extract_numbers("nothing") == [] assert extract_numbers("this is a one twenty one test") == [1.0, 21.0] assert extract_numbers("1 dog, seven pigs, macdonald had a farm, " "3 times 5 macarena") == [1, 7, 3, 5] ``` ### Extract durations extract datetime.timedelta objects ```python ## extract durations from lingua_franca.parse import extract_duration from datetime import timedelta assert extract_duration("nothing") == (None, 'nothing') assert extract_duration("Nineteen minutes past the hour") == ( timedelta(minutes=19), "past the hour") assert extract_duration("wake me up in three weeks, four hundred ninety seven" " days, and three hundred 91.6 seconds") == ( timedelta(weeks=3, days=497, seconds=391.6), "wake me up in , , and") assert extract_duration( "The movie is one hour, fifty seven and a half minutes long") == ( timedelta(hours=1, minutes=57.5), "the movie is , long") ``` ### Extract dates extract datetime.datetime objects ```python ## extract date times from datetime import datetime from lingua_franca.parse import extract_datetime, normalize def extractWithFormat(text): date = datetime(2017, 6, 27, 13, 4) # Tue June 27, 2017 @ 1:04pm [extractedDate, leftover] = extract_datetime(text, date) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = 
extractWithFormat(normalize(text)) assert res[0] == expected_date assert res[1] == expected_leftover testExtract("now is the time", "2017-06-27 13:04:00", "is time") testExtract("in a couple minutes", "2017-06-27 13:06:00", "") testExtract("What is the day after tomorrow's weather?", "2017-06-29 00:00:00", "what is weather") testExtract("Remind me at 10:45 pm", "2017-06-27 22:45:00", "remind me") testExtract("what is the weather on friday morning", "2017-06-30 08:00:00", "what is weather") testExtract("what is tomorrow's weather", "2017-06-28 00:00:00", "what is weather") testExtract("remind me to call mom next tuesday", "2017-07-04 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 3 weeks", "2017-07-18 00:00:00", "remind me to call mom") testExtract("set an alarm for tonight 9:30", "2017-06-27 21:30:00", "set alarm") testExtract("on the evening of june 5th 2017 remind me to call my mother", "2017-06-05 19:00:00", "remind me to call my mother") ``` ## Getting Started ### Loading a language Before using Lingua Franca's other functions, you'll need to load one or more languages into memory, using part or all of a BCP-47 language code: ```python # Load a language lingua_franca.load_language('en') # Load multiple languages at once # # If no default language is set, the first # element will become the default lingua_franca.load_languages(['en', 'es']) # Change the default language lingua_franca.set_default_language('es') ``` See the documentation for more information about loading and unloading languages. ### Calling localized functions Most of Lingua Franca's functions have been localized. You can call a function in any language you've loaded; this is always specified by the function's `lang` parameter. If you omit that parameter, the function will be called in the current default language. 
Example: ```python >>> from lingua_franca import load_languages, \ set_default_lang, parse >>> load_languages(['es', 'en']) >>> parse.extract_number("uno") 1 >>> parse.extract_number("one") False >>> parse.extract_number("one", lang='en') 1 >>> set_default_lang('en') >>> parse.extract_number("uno") False >>> parse.extract_number("one") 1 ``` In some languages, certain parameters have no effect, either because those parameters do not apply, or because the localization is not complete. It's important to remember that Lingua Franca is in alpha. Support for a particular language may be inconsistent, and one language's version of a complex function might be outdated compared with another. New functionality usually starts in the languages spoken by major contributors. If your language's functions are lacking, we'd love your help improving them! (See below, "Contributing.") ## Contributing to this project We welcome all contributions to Lingua Franca. To get started: ### 0. Sign a Contributor Licensing Agreement To protect yourself, the project, and users of Mycroft technologies, we require a Contributor Licensing Agreement (CLA) before accepting any code contribution. This agreement makes it crystal clear that, along with your code, you are offering a license to use it within the confines of this project. You retain ownership of the code, this is just a license. You will also be added to [our list of excellent human beings](https://github.com/MycroftAI/contributors)! Please visit https://mycroft.ai/cla to initiate this one-time signing. ### 1. Setup a local copy of the project 1. [Fork the project](https://help.github.com/articles/fork-a-repo/) to create your own copy. 2. Clone the repository and change into that directory ```bash git clone https://github.com/your-username/lingua-franca/ cd lingua-franca ``` 3. Setup a lightweight virtual environment (venv) for the project. 
This creates an isolated environment that can have it's own independent set of installed Python packages. ```bash python3 -m venv .venv source .venv/bin/activate ``` To exit the venv you can run `deactivate` or close the terminal window. 4. Install the package and it's dependencies ```bash pip install wheel python -m pip install . pip install pytest python setup.py install ``` 5. To check that everything is installed correctly, let's run the existing test-suite. ```bash pytest ``` ### 2. Have a look at the project's structure The package's layout is described in `project-structure.md`, along with some important notes. It's pretty intuitive, but uniform file and function names are important to Lingua Franca's operation. ### 3. Writing tests We utilize a Test Driven Development (TDD) methodology so the first step is always to add tests for whatever you want to add or fix. If it's a bug, we must not have a test that covers that specific case, so we want to add another test. If you are starting on a new language then you can take a look at the tests for other languages to get started. Tests are all located in `lingua_franca/test`. Each language should have two test files: - `test_format_lang.py` - `test_parse_lang.py` ### 4. Run tests to confirm they fail Generally, using TDD, all tests should fail when they are first added. If the test is passing when you haven't yet fixed the bug or added the functionality, something must be wrong with the test or the test runner. ```bash pytest ``` ### 5. Write code Now we can add our new code. There are three main files for each language: - `common_data_lang.py` Common data that can be used across formatting and parsing such as dictionaries of number names. - `format_lang.py` All formatting functions for this language. - `parse_lang.py` All parsing functions for this language. Since we have already written our unit tests, we can run these regularly to see our progress. ### 6. 
Document your code Document code using [Google-style docstrings](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). Our automated documentation tools expect that format. All functions and class methods that are expected to be called externally should include a docstring. (And those that aren't should be [prefixed with a single underscore](https://docs.python.org/3/tutorial/classes.html#private-variables). ### 7. Try it in Mycroft Lingua Franca is installed by default when you install Mycroft-core, but for development you generally have this repo cloned elsewhere on your computer. You can use your changes in Mycroft by installing it in the Mycroft virtual environment. If you added the Mycroft helper commands during setup you can just use: ```bash mycroft-pip install /path/to/your/lingua-franca ``` Otherwise you need to activate that venv manually: ```bash cd ~/mycroft-core source venv-activate.sh pip install /path/to/your/lingua-franca ``` Now, when talking with Mycroft, it will be using your development version of Lingua Franca. ### 8. Commit changes Make commits in logical units, and describe them thoroughly. If addressing documented issue, use the issue identifier at the very beginning of each commit. For instance: ```bash git commit -m "Issues-123 - Fix 'demain' date extraction in French" ``` ### 9. Submit a PR Once your changes are ready for review, [create a pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). Like commit messages, the PR title and description should properly describe the changes you have made, along with any additional information that reviewers who do not speak your language might need to understand. ### 10. Waiting for a review While you wait for a review of your contribution, why not take a moment to review some other pull requests? 
This is a great way to learn and help progress the queue of pull requests, which means your contribution will be seen more quickly! ## Credits Though it is now a standalone package, Lingua Franca's codebase was a spinoff from Mycroft-core. In addition to those represented in Lingua Franca's git log, a great many people contributed to this code before the spinoff. Although all are listed in MycroftAI's [List of Excellent People](https://github.com/MycroftAI/contributors), it seems proper to acknowledge the specific individuals who helped write *this* package, since they are no longer represented in `git log`. To the best of the maintainers' knowledge, all of the "lost" contributors are listed in `pre-spinoff-credits.md`. Names are listed as they appeared in `git log`, or as they are known to the Mycroft community. Those who've contributed since the spinoff are, of course, in Lingua Franca's `git log` and the GitHub "Contributors" pane. All contributors are on the List of Excellent People, regardless of when they contributed. If you contributed to the original code, and your name is missing from `pre-spinoff-credits.md`, please inform a maintainer or file an issue, so we can give credit where credit is due!lingua-franca-release-v0.4.3/requirements.txt000066400000000000000000000000261426211343400213130ustar00rootroot00000000000000python-dateutil~=2.6.0lingua-franca-release-v0.4.3/setup.py000066400000000000000000000031501426211343400175420ustar00rootroot00000000000000import os from setuptools import setup def package_files(directory): paths = [] for (path, directories, filenames) in os.walk(directory): for filename in filenames: paths.append(os.path.join('..', path, filename)) return paths def required(requirements_file): """ Read requirements file and remove comments and empty lines. 
""" with open(os.path.join(os.path.dirname(__file__), requirements_file), 'r') as f: requirements = f.read().splitlines() return [pkg for pkg in requirements if pkg.strip() and not pkg.startswith("#")] extra_files = package_files('lingua_franca') with open("readme.md", "r") as fh: long_description = fh.read() setup( name='lingua_franca', version='0.4.3', packages=['lingua_franca', 'lingua_franca.lang'], url='https://github.com/MycroftAI/lingua-franca', license='Apache2.0', package_data={'': extra_files}, include_package_data=True, install_requires=required('requirements.txt'), author='Mycroft AI', author_email='dev@mycroft.ai', description='Mycroft\'s multilingual text parsing and formatting library', long_description=long_description, long_description_content_type="text/markdown", classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Topic :: Text Processing :: Linguistic', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', ], ) lingua-franca-release-v0.4.3/test/000077500000000000000000000000001426211343400170105ustar00rootroot00000000000000lingua-franca-release-v0.4.3/test/__init__.py000066400000000000000000000000001426211343400211070ustar00rootroot00000000000000lingua-franca-release-v0.4.3/test/test_format.py000066400000000000000000001026651426211343400217230ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class TestNiceNumberFormat(unittest.TestCase):
    """Tests for ``nice_number`` using the module's default language."""

    tmp_var = None

    def set_tmp_var(self, val):
        """Store *val* on this instance as ``tmp_var``."""
        self.tmp_var = val

    def test_convert_float_to_nice_number(self):
        """Each key of ``NUMBERS_FIXTURE_EN`` formats to its mapped string."""
        for number, number_str in NUMBERS_FIXTURE_EN.items():
            # subTest reports every mismatching fixture entry instead of
            # stopping at the first one.
            with self.subTest(number=number):
                result = nice_number(number)
                self.assertEqual(
                    result, number_str,
                    'should format {} as {} and not {}'.format(
                        number, number_str, result))

    def test_specify_denominator(self):
        """An explicit ``denominators`` list limits which fractions appear."""
        result = nice_number(5.5, denominators=[1, 2, 3])
        self.assertEqual(
            result, '5 and a half',
            'should format 5.5 as 5 and a half not {}'.format(result))

        # No thirds allowed here, so the decimal form is expected back.
        result = nice_number(2.333, denominators=[1, 2])
        self.assertEqual(
            result, '2.333',
            'should format 2.333 as 2.333 not {}'.format(result))

    def test_no_speech(self):
        """``speech=False`` yields the written form (digits and a slash)."""
        result = nice_number(6.777, speech=False)
        self.assertEqual(
            result, '6 7/9',
            'should format 6.777 as 6 7/9 not {}'.format(result))

        result = nice_number(6.0, speech=False)
        self.assertEqual(
            result, '6',
            'should format 6.0 as 6 not {}'.format(result))

    def test_unknown_language(self):
        """An unknown / unhandled language returns the number as a string.

        The call is also expected to emit a ``UserWarning``.
        """
        # Should throw a warning. Would raise the same text as a
        # NotImplementedError, but nice_number() bypasses and returns
        # its input as a string
        with self.assertWarns(UserWarning):
            result = nice_number(5.5, lang='as-df')
        self.assertEqual(
            result, '5.5',
            'should format 5.5 as 5.5 not {}'.format(result))
def test_convert_hundreds(self):
    """Check the spoken form of integers from one hundred upward."""
    expectations = (
        (100, "one hundred"),
        (666, "six hundred and sixty six"),
        (1456, "fourteen fifty six"),
        (103254654,
         "one hundred and three million, two hundred and fifty four "
         "thousand, six hundred and fifty four"),
        (1512457,
         "one million, five hundred and twelve thousand, four hundred "
         "and fifty seven"),
        (209996,
         "two hundred and nine thousand, nine hundred and ninety six"),
    )
    # Checked in the order listed; the first mismatch fails the test.
    for value, spoken in expectations:
        self.assertEqual(pronounce_number(value), spoken)
def test_large_numbers(self):
    """Check ``pronounce_number`` on very large integers, floats and infinity.

    Covers both values of the ``short_scale`` flag, floats too large to be
    held exactly, and positive/negative infinity.
    """
    # Below one billion the expected text is the same for both scales.
    self.assertEqual(
        pronounce_number(299792458, short_scale=True),
        "two hundred and ninety nine million, seven hundred "
        "and ninety two thousand, four hundred and fifty eight")
    self.assertEqual(
        pronounce_number(299792458, short_scale=False),
        "two hundred and ninety nine million, seven hundred "
        "and ninety two thousand, four hundred and fifty eight")
    # Same integer, different scale: the names of the large groups differ.
    self.assertEqual(
        pronounce_number(100034000000299792458, short_scale=True),
        "one hundred quintillion, thirty four quadrillion, "
        "two hundred and ninety nine million, seven hundred "
        "and ninety two thousand, four hundred and fifty eight")
    self.assertEqual(
        pronounce_number(100034000000299792458, short_scale=False),
        "one hundred trillion, thirty four thousand billion, "
        "two hundred and ninety nine million, seven hundred "
        "and ninety two thousand, four hundred and fifty eight")
    self.assertEqual(
        pronounce_number(10000000000, short_scale=True),
        "ten billion")
    self.assertEqual(
        pronounce_number(1000000000000, short_scale=True),
        "one trillion")
    # TODO maybe beautify this
    self.assertEqual(
        pronounce_number(1000001, short_scale=True),
        "one million, one")
    self.assertEqual(pronounce_number(95505896639631893),
                     "ninety five quadrillion, five hundred and five "
                     "trillion, eight hundred and ninety six billion, six "
                     "hundred and thirty nine million, six hundred and "
                     "thirty one thousand, eight hundred and ninety three")
    self.assertEqual(pronounce_number(95505896639631893,
                                      short_scale=False),
                     "ninety five thousand five hundred and five billion, "
                     "eight hundred and ninety six thousand six hundred "
                     "and thirty nine million, six hundred and thirty one "
                     "thousand, eight hundred and ninety three")
    # Note: the literal 10e80 equals 1e81.
    self.assertEqual(pronounce_number(10e80, places=1),
                     "one qesvigintillion")
    # TODO floating point rounding issues might happen
    # NOTE(review): the expected text below presumably spells out the exact
    # integer value of the nearest double, not the decimal literal -- confirm.
    self.assertEqual(pronounce_number(1.9874522571e80, places=9),
                     "one hundred and ninety eight quinquavigintillion, "
                     "seven hundred and forty five quattuorvigintillion, "
                     "two hundred and twenty five tresvigintillion, "
                     "seven hundred and nine uuovigintillion, "
                     "nine hundred and ninety nine unvigintillion, "
                     "nine hundred and eighty nine vigintillion, "
                     "seven hundred and thirty novendecillion, nine "
                     "hundred and nineteen octodecillion, nine hundred "
                     "and ninety nine septendecillion, nine hundred "
                     "and fifty five sedecillion, four hundred and "
                     "ninety eight quinquadecillion, two hundred and "
                     "fourteen quattuordecillion, eight hundred and "
                     "forty five tredecillion, four hundred and "
                     "twenty nine duodecillion, four hundred and "
                     "forty four undecillion, three hundred and "
                     "thirty six decillion, seven hundred and twenty "
                     "four nonillion, five hundred and sixty nine "
                     "octillion, three hundred and seventy five "
                     "septillion, two hundred and thirty nine sextillion,"
                     " six hundred and seventy quintillion, five hundred "
                     "and seventy four quadrillion, seven hundred and "
                     "thirty nine trillion, seven hundred and forty "
                     "eight billion, four hundred and seventy million, "
                     "nine hundred and fifteen thousand, seventy two")
    self.assertEqual(pronounce_number(1.00000000000000001e150),
                     "nine hundred and ninety nine millinillion, nine "
                     "hundred and ninety nine uncentillion, nine hundred "
                     "and ninety nine centillion, nine hundred and ninety"
                     " nine nonagintillion, nine hundred and ninety nine"
                     " octogintillion, nine hundred and eighty"
                     " septuagintillion, eight hundred and thirty five "
                     "sexagintillion, five hundred and ninety six "
                     "quinquagintillion, one hundred and seventy two"
                     " quadragintillion, four hundred and thirty seven"
                     " noventrigintillion, three hundred and seventy four"
                     " octotrigintillion, five hundred and ninety"
                     " septentrigintillion, five hundred and seventy"
                     " three sestrigintillion, one hundred and twenty "
                     "quinquatrigintillion, fourteen quattuortrigintillion"
                     ", thirty trestrigintillion, three hundred and "
                     "eighteen duotrigintillion, seven hundred and ninety"
                     " three untrigintillion, ninety one trigintillion,"
                     " one hundred and sixty four novemvigintillion, eight"
                     " hundred and ten octovigintillion, one hundred and"
                     " fifty four septemvigintillion, one hundred "
                     "qesvigintillion, one hundred and twelve "
                     "quinquavigintillion, two hundred and three "
                     "quattuorvigintillion, six hundred and seventy "
                     "eight tresvigintillion, five hundred and eighty "
                     "two uuovigintillion, nine hundred and seventy six"
                     " unvigintillion, two hundred and ninety eight "
                     "vigintillion, two hundred and sixty eight "
                     "novendecillion, six hundred and sixteen "
                     "octodecillion, two hundred and twenty one "
                     "septendecillion, one hundred and fifty one"
                     " sedecillion, nine hundred and sixty two "
                     "quinquadecillion, seven hundred and two"
                     " quattuordecillion, sixty tredecillion, two hundred"
                     " and sixty six duodecillion, one hundred and "
                     "seventy six undecillion, five decillion, four "
                     "hundred and forty nonillion, five hundred and"
                     " sixty seven octillion, thirty two septillion, "
                     "three hundred and thirty one sextillion, "
                     "two hundred and eight quintillion, four hundred and "
                     "three quadrillion, nine hundred and forty eight "
                     "trillion, two hundred and thirty three billion, "
                     "three hundred and seventy three million, five "
                     "hundred and fifteen thousand, seven hundred and "
                     "seventy six")

    # infinity
    # Doubling the largest finite float overflows to positive infinity.
    self.assertEqual(
        pronounce_number(sys.float_info.max * 2), "infinity")
    self.assertEqual(
        pronounce_number(float("inf")),
        "infinity")
    self.assertEqual(
        pronounce_number(float("-inf")),
        "negative infinity")
@classmethod
def setUpClass(cls):
    """Load each language's ``date_time_test.json`` into ``cls.test_config``.

    Every sub-directory of ``date_time_format.config_path`` that contains a
    ``date_time_test.json`` file adds one entry, keyed by the directory
    name and holding the parsed JSON document.
    """
    cls.test_config = {}
    for sub_dir in Path(date_time_format.config_path).iterdir():
        test_file = sub_dir / 'date_time_test.json'
        if not (sub_dir.is_dir() and test_file.exists()):
            continue
        print("Getting test for " + str(test_file))
        # JSON text is UTF-8; decode it explicitly rather than relying on
        # the platform's locale encoding.
        with test_file.open(encoding='utf-8') as f:
            cls.test_config[sub_dir.name] = json.load(f)
False, False)) self.assertEqual(nice_time(dt), "one twenty two") self.assertEqual(nice_time(dt, use_ampm=True), "one twenty two p.m.") self.assertEqual(nice_time(dt, speech=False), "1:22") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "thirteen twenty two") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "thirteen twenty two") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "one o'clock") self.assertEqual(nice_time(dt, use_ampm=True), "one p.m.") self.assertEqual(nice_time(dt, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "thirteen hundred") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "thirteen hundred") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "one oh two") self.assertEqual(nice_time(dt, use_ampm=True), "one oh two p.m.") self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "thirteen zero two") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "thirteen zero two") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "twelve oh two") 
self.assertEqual(nice_time(dt, use_ampm=True), "twelve oh two a.m.") self.assertEqual(nice_time(dt, speech=False), "12:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "zero zero zero two") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "zero zero zero two") dt = datetime.datetime(2018, 2, 8, 1, 2, 33, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "one oh two") self.assertEqual(nice_time(dt, use_ampm=True), "one oh two a.m.") self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 AM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "01:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "01:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "zero one zero two") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "zero one zero two") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "quarter past twelve") self.assertEqual(nice_time(dt, use_ampm=True), "quarter past twelve p.m.") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_ampm=True), "half past five a.m.") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "quarter to two") def test_nice_date(self): for lang in self.test_config: i = 1 while (self.test_config[lang].get('test_nice_date') and self.test_config[lang]['test_nice_date'].get(str(i))): p = self.test_config[lang]['test_nice_date'][str(i)] dp = ast.literal_eval(p['datetime_param']) np = ast.literal_eval(p['now']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], 
def test_nice_date_time(self):
    """Check ``nice_date_time`` against each language's fixture cases.

    Cases are read from ``self.test_config[lang]['test_nice_date_time']``
    under consecutive string keys starting at ``"1"``; a language's run
    stops at the first missing or empty entry. Each language is made the
    default while its cases run, and the default is reset to ``'en'``
    afterwards, even if an assertion fails.
    """
    # TODO: migrate these tests (in res files) to respect the new
    # language loading features. Right now, some of them break if
    # their languages are not default.
    try:
        for lang, config in self.test_config.items():
            set_default_lang(lang)
            cases = config.get('test_nice_date_time') or {}
            i = 1
            while cases.get(str(i)):
                p = cases[str(i)]
                # Fixture values are Python literals stored as strings.
                dp = ast.literal_eval(p['datetime_param'])
                np = ast.literal_eval(p['now'])
                dt = datetime.datetime(
                    dp[0], dp[1], dp[2], dp[3], dp[4], dp[5],
                    tzinfo=default_timezone())
                now = None if not np else datetime.datetime(
                    np[0], np[1], np[2], np[3], np[4], np[5],
                    tzinfo=default_timezone())
                print('Testing for ' + lang + ' that ' + str(dt) +
                      ' is date time ' + p['assertEqual'])
                self.assertEqual(
                    p['assertEqual'],
                    nice_date_time(
                        dt, lang=lang, now=now,
                        use_24hour=ast.literal_eval(p['use_24hour']),
                        use_ampm=ast.literal_eval(p['use_ampm'])))
                i += 1
    finally:
        # Without this, a failing case would leave a foreign default
        # language active for every test that runs afterwards.
        set_default_lang('en')
class TestNiceRelativeTime(unittest.TestCase):
    """Tests for ``nice_relative_time``."""

    def test_format_nice_relative_time(self):
        """Offsets from a fixed instant map to the expected short phrase."""
        start = datetime.datetime(2017, 1, 31, 13, 22, 3,
                                  tzinfo=default_timezone())
        offsets_and_phrases = (
            (datetime.timedelta(hours=2), "2 hours"),
            (datetime.timedelta(hours=2, minutes=27), "2 hours"),
            (datetime.timedelta(seconds=47), "47 seconds"),
            (datetime.timedelta(days=3), "3 days"),
            (datetime.timedelta(days=3, hours=20), "4 days"),
            (datetime.timedelta(days=957, hours=2, seconds=12), "957 days"),
        )
        for offset, phrase in offsets_and_phrases:
            later = start + offset
            self.assertEqual(
                nice_relative_time(when=later, relative_to=start),
                phrase
            )
def test_convert_int(self):
    """Check the Catalan spoken form of assorted integers from 0 to 99."""
    expectations = (
        (0, "zero"),
        (1, "un"),
        (10, "deu"),
        (15, "quinze"),
        (21, "vint-i-un"),
        (27, "vint-i-set"),
        (30, "trenta"),
        (19, "dinou"),
        (88, "vuitanta-vuit"),
        (46, "quaranta-sis"),
        (99, "noranta-nou"),
    )
    for value, spoken in expectations:
        self.assertEqual(pronounce_number(value, lang="ca"), spoken)
self.assertEqual(pronounce_number(-21, lang="ca"), "menys vint-i-un") self.assertEqual(pronounce_number(-27, lang="ca"), "menys vint-i-set") self.assertEqual(pronounce_number(-30, lang="ca"), "menys trenta") self.assertEqual(pronounce_number(-35, lang="ca"), "menys trenta-cinc") self.assertEqual(pronounce_number(-83, lang="ca"), "menys vuitanta-tres") self.assertEqual(pronounce_number(-19, lang="ca"), "menys dinou") self.assertEqual(pronounce_number(-88, lang="ca"), "menys vuitanta-vuit") self.assertEqual(pronounce_number(-46, lang="ca"), "menys quaranta-sis") self.assertEqual(pronounce_number(-99, lang="ca"), "menys noranta-nou") def test_convert_decimals(self): self.assertEqual(pronounce_number( 0.05, lang="ca"), "zero coma zero cinc") self.assertEqual(pronounce_number( -0.05, lang="ca"), "menys zero coma zero cinc") self.assertEqual(pronounce_number(1.234, lang="ca"), "un coma dos tres") self.assertEqual(pronounce_number(21.234, lang="ca"), "vint-i-un coma dos tres") self.assertEqual(pronounce_number(21.234, lang="ca", places=1), "vint-i-un coma dos") self.assertEqual(pronounce_number(21.234, lang="ca", places=0), "vint-i-un") self.assertEqual(pronounce_number(21.234, lang="ca", places=3), "vint-i-un coma dos tres quatre") self.assertEqual(pronounce_number(21.234, lang="ca", places=4), "vint-i-un coma dos tres quatre") self.assertEqual(pronounce_number(20.234, lang="ca", places=5), "vint coma dos tres quatre") self.assertEqual(pronounce_number(-21.234, lang="ca"), "menys vint-i-un coma dos tres") self.assertEqual(pronounce_number(-21.234, lang="ca", places=1), "menys vint-i-un coma dos") self.assertEqual(pronounce_number(-21.234, lang="ca", places=0), "menys vint-i-un") self.assertEqual(pronounce_number(-21.234, lang="ca", places=3), "menys vint-i-un coma dos tres quatre") self.assertEqual(pronounce_number(-21.234, lang="ca", places=4), "menys vint-i-un coma dos tres quatre") self.assertEqual(pronounce_number(-21.234, lang="ca", places=5), "menys vint-i-un coma 
def test_pm(self):
    """Check Catalan ``nice_time`` output for afternoon and variant cases.

    Covers 13:22, 13:00 and 13:02 in spoken/written and 12h/24h forms, then
    the time-system variants (default, Spanish-like, Bell, Full Bell) at
    12:15 and 00:14.
    """
    dt = datetime.datetime(2017, 1, 31, 13, 22, 3,
                           tzinfo=default_timezone())

    # Verify defaults haven't changed
    self.assertEqual(nice_time(dt, lang="ca-es"),
                     nice_time(dt, "ca-es", True, False, False))

    self.assertEqual(nice_time(dt, lang="ca"),
                     "la una i vint-i-dos")
    self.assertEqual(nice_time(dt, lang="ca", use_ampm=True),
                     "la una i vint-i-dos de la tarda")
    self.assertEqual(nice_time(dt, lang="ca", speech=False), "1:22")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_ampm=True), "1:22 PM")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_24hour=True), "13:22")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_24hour=True, use_ampm=True), "13:22")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=True), "les tretze i vint-i-dos")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False), "les tretze i vint-i-dos")

    dt = datetime.datetime(2017, 1, 31, 13, 0, 3,
                           tzinfo=default_timezone())
    self.assertEqual(nice_time(dt, lang="ca"),
                     "la una en punt")
    self.assertEqual(nice_time(dt, lang="ca", use_ampm=True),
                     "la una en punt de la tarda")
    self.assertEqual(nice_time(dt, lang="ca", speech=False),
                     "1:00")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_ampm=True), "1:00 PM")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_24hour=True), "13:00")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_24hour=True, use_ampm=True), "13:00")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=True), "les tretze")

    dt = datetime.datetime(2017, 1, 31, 13, 2, 3,
                           tzinfo=default_timezone())
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True),
                     "les tretze i dos")
    self.assertEqual(nice_time(dt, lang="ca", use_ampm=True),
                     "la una i dos de la tarda")
    self.assertEqual(nice_time(dt, lang="ca", speech=False),
                     "1:02")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_ampm=True), "1:02 PM")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_24hour=True), "13:02")
    self.assertEqual(nice_time(dt, lang="ca", speech=False,
                               use_24hour=True, use_ampm=True), "13:02")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=True), "les tretze i dos")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False), "les tretze i dos")

    dt = datetime.datetime(2017, 1, 31, 12, 15, 0,
                           tzinfo=default_timezone())
    # Default Watch system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False),
                     "les dotze i quinze")
    # Spanish-like time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False,
                               variant=TimeVariantCA.SPANISH_LIKE),
                     "les dotze i quart")
    # Catalan Bell time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False,
                               variant=TimeVariantCA.BELL),
                     "un quart d'una de la tarda")
    # Catalan Full Bell time system
    # This check previously passed TimeVariantCA.BELL again, so the Full
    # Bell variant was never exercised at 12:15.
    # NOTE(review): expected phrase kept from the old assertion; confirm
    # that FULL_BELL really yields it at exactly a quarter past.
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False,
                               variant=TimeVariantCA.FULL_BELL),
                     "un quart d'una de la tarda")

    dt = datetime.datetime(2017, 1, 31, 00, 14, 0,
                           tzinfo=default_timezone())
    # Default Watch system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False),
                     "les zero i catorze")
    # Spanish-like time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False,
                               variant=TimeVariantCA.SPANISH_LIKE),
                     "les dotze i catorze")
    # Catalan Bell time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False,
                               variant=TimeVariantCA.BELL),
                     "les dotze i catorze minuts de la nit")
    # Catalan Full Bell time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False,
                               variant=TimeVariantCA.FULL_BELL),
                     "un quart d'una de la matinada")
def test_midday(self):
    # A quarter past noon in Catalan, for each combination of the
    # speech / use_24hour / use_ampm options.
    noon_quarter = datetime.datetime(2017, 1, 31, 12, 15, 9,
                                     tzinfo=default_timezone())
    expectations = (
        ({}, "les dotze i quinze"),
        ({"use_ampm": True}, "les dotze i quinze del migdia"),
        ({"speech": False}, "12:15"),
        ({"speech": False, "use_ampm": True}, "12:15 PM"),
        ({"speech": False, "use_24hour": True}, "12:15"),
        ({"speech": False, "use_24hour": True, "use_ampm": True}, "12:15"),
        ({"use_24hour": True, "use_ampm": True}, "les dotze i quinze"),
        ({"use_24hour": True, "use_ampm": False}, "les dotze i quinze"),
    )
    for options, rendered in expectations:
        self.assertEqual(nice_time(noon_quarter, lang="ca-es", **options),
                         rendered)
def test_variant_strings(self):
    # Time-system variants may be selected by name; unknown names must be
    # rejected with ValueError.
    dt = datetime.datetime(2017, 1, 31, 12, 15, 0,
                           tzinfo=default_timezone())
    # Default variant
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False, variant="default"),
                     "les dotze i quinze")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False),
                     "les dotze i quinze")

    dt = datetime.datetime(2017, 1, 31, 0, 14, 0,
                           tzinfo=default_timezone())
    # Spanish-like time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False, variant="spanish"),
                     "les dotze i catorze")
    # Catalan Bell time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False, variant="bell"),
                     "les dotze i catorze minuts de la nit")
    # Catalan Full Bell time system
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False, variant="full_bell"),
                     "un quart d'una de la matinada")
    self.assertEqual(nice_time(dt, lang="ca", use_24hour=True,
                               use_ampm=False, variant="traditional"),
                     "un quart d'una de la matinada")

    # Each invalid name needs its own assertRaises context: the context
    # exits at the first exception, so statements placed after a raising
    # call inside the same block would never run.
    for bad_variant in ("invalid", "bad_VARIANT", ""):
        with self.subTest(variant=bad_variant):
            with self.assertRaises(ValueError):
                nice_time(dt, lang="ca", variant=bad_variant)
class TestMixedFraction(unittest.TestCase):
    def test_convert_to_fraction(self):
        # Each entry pairs the positional arguments passed to
        # convert_to_mixed_fraction with the result the test expects.
        cases = (
            ((8,), (8, 0, 1)),
            ((8.00001,), (8, 0, 1)),
            ((8.5,), (8, 1, 2)),
            ((8.587465135,), None),
            ((8.587465135, range(1, 101)), (8, 47, 80)),
        )
        for arguments, expected in cases:
            self.assertEqual(cmf(*arguments), expected)
def test_convert_float_to_nice_number(self):
    # Each fixture entry runs in its own subTest so that one wrong rendering
    # does not hide the remaining ones; the value is computed once and
    # reused in the failure message.
    for number, number_str in NUMBERS_FIXTURE_CS.items():
        with self.subTest(number=number):
            spoken = nice_number(number, speech=True)
            self.assertEqual(
                spoken, number_str,
                'měl by zformátovat {} jako {}, ne {}'.format(
                    number, number_str, spoken))
def test_convert_int(self):
    # Non-negative integers and the Czech words expected for them.
    spoken_by_number = (
        (0, "nula"),
        (1, "jedna"),
        (10, "deset"),
        (15, "patnáct"),
        (20, "dvacet"),
        (27, "dvacet sedm"),
        (30, "třicet"),
        (33, "třicet tři"),
    )
    for number, spoken in spoken_by_number:
        self.assertEqual(pronounce_number(number), spoken)
def test_convert_scientific_notation(self):
    # (number, extra keyword arguments, expected Czech wording) for
    # pronounce_number called with scientific=True.
    cases = (
        (0, {}, "nula"),
        (33, {}, "tři tečka tři krát deset na mocninu jedna"),
        (299792458, {},
         "dva tečka devět devět krát deset na mocninu osm"),
        (299792458, {"places": 6},
         "dva tečka devět devět sedm devět dva pět krát "
         "deset na mocninu osm"),
        (1.672e-27, {"places": 3},
         "jedna tečka šest sedm dva krát deset na mocninu "
         "záporné dvacet sedm"),
    )
    for number, extra, spoken in cases:
        self.assertEqual(
            pronounce_number(number, scientific=True, **extra), spoken)
deset na " "mocninu záporné jedna sto " "a padesát") # value is platform dependent so better not use in tests? # self.assertEqual( # pronounce_number(sys.float_info.min), "dva tečka dva dva times " # "ten na mocninu " # "negative tři sto " # "a osm") # self.assertEqual( # pronounce_number(sys.float_info.max), "jedna tečka sedm devět " # "krát deset na mocninu" # " tři sto a osm") def test_large_numbers(self): self.assertEqual( pronounce_number(299792458, short_scale=True), "dva sto a devadesát devět million, sedm sto " "a devadesát dva tisíc, čtyři sto a padesát osm") self.assertEqual( pronounce_number(299792458, short_scale=False), "dva sto a devadesát devět milion, sedm sto " "a devadesát dva tisíc, čtyři sto a padesát osm") self.assertEqual( pronounce_number(100034000000299792458, short_scale=True), "jedna sto quintillion, třicet čtyři quadrillion, " "dva sto a devadesát devět million, sedm sto " "a devadesát dva tisíc, čtyři sto a padesát osm") self.assertEqual( pronounce_number(100034000000299792458, short_scale=False), "jedna sto bilion, třicet čtyři tisíc miliarda," " dva sto a devadesát devět milion, sedm sto" " a devadesát dva tisíc, čtyři sto a padesát osm") self.assertEqual( pronounce_number(10000000000, short_scale=True), "deset billion") self.assertEqual( pronounce_number(1000000000000, short_scale=True), "jedna trillion") # TODO maybe beautify this self.assertEqual( pronounce_number(1000001, short_scale=True), "jedna million, jedna") self.assertEqual(pronounce_number(95505896639631893, short_scale=True), "devadesát pět quadrillion, pět sto a pět " "trillion, osm sto a devadesát šest billion, šest " "sto a třicet devět million, šest sto a " "třicet jedna tisíc, osm sto a devadesát tři") self.assertEqual(pronounce_number(95505896639631893, short_scale=False), "devadesát pět tisíc pět sto a pět miliarda, " "osm sto a devadesát šest tisíc šest sto " "a třicet devět milion, šest sto a třicet jedna " "tisíc, osm sto a devadesát tři") 
self.assertEqual(pronounce_number(10e80, places=1), "jedna qesvigintillion") # TODO floating point rounding issues might happen self.maxDiff = None self.assertEqual(pronounce_number(1.9874522571e80, places=9), "jedna sto a devadesát osm quinquavigintillion, " "sedm sto a čtyřicet pět quattuorvigintillion, " "dva sto a dvacet pět tresvigintillion, " "sedm sto a devět uuovigintillion, " "devět sto a devadesát devět unvigintillion, " "devět sto a osmdesát devět vigintillion, " "sedm sto a třicet novemdecillion, devět " "sto a devatenáct octodecillion, devět sto " "a devadesát devět septendecillion, devět sto " "a padesát pět sexdecillion, čtyři sto a " "devadesát osm quindecillion, dva sto a " "čtrnáct quadrdecillion, osm sto a " "čtyřicet pět tredecillion, čtyři sto a " "dvacet devět duodecillion, čtyři sto a " "čtyřicet čtyři undecillion, tři sto a " "třicet šest decillion, sedm sto a dvacet " "čtyři nonillion, pět sto a šedesát devět " "octillion, tři sto a sedmdesát pět " "septillion, dva sto a třicet devět sextillion," " šest sto a sedmdesát quintillion, pět sto " "a sedmdesát čtyři quadrillion, sedm sto a " "třicet devět trillion, sedm sto a čtyřicet" " osm billion, čtyři sto a sedmdesát million, " "devět sto a patnáct tisíc, sedmdesát dva") self.assertEqual(pronounce_number(1.00000000000000001e150), "devět sto a devadesát devět millinillion, devět " "sto a devadesát devět uncentillion, devět sto " "a devadesát devět centillion, devět sto a devadesát" " devět nonagintillion, devět sto a devadesát devět" " octogintillion, devět sto a osmdesát" " septuagintillion, osm sto a třicet pět " "sexagintillion, pět sto a devadesát šest " "quinquagintillion, jedna sto a sedmdesát dva" " quadragintillion, čtyři sto a třicet sedm" " noventrigintillion, tři sto a sedmdesát čtyři" " octotrigintillion, pět sto a devadesát" " septentrigintillion, pět sto a sedmdesát" " tři sestrigintillion, jedna sto a dvacet " "quinquatrigintillion, čtrnáct quattuortrigintillion" ", třicet 
trestrigintillion, tři sto a " "osmnáct duotrigintillion, sedm sto a devadesát" " tři untrigintillion, devadesát jedna trigintillion," " jedna sto a šedesát čtyři novemvigintillion, osm" " sto a deset octovigintillion, jedna sto a" " padesát čtyři septemvigintillion, jedna sto " "qesvigintillion, jedna sto a dvanáct " "quinquavigintillion, dva sto a tři " "quattuorvigintillion, šest sto a sedmdesát " "osm tresvigintillion, pět sto a osmdesát " "dva uuovigintillion, devět sto a sedmdesát šest" " unvigintillion, dva sto a devadesát osm " "vigintillion, dva sto a šedesát osm " "novemdecillion, šest sto a šestnáct " "octodecillion, dva sto a dvacet jedna " "septendecillion, jedna sto a padesát jedna" " sexdecillion, devět sto a šedesát dva " "quindecillion, sedm sto a dva" " quadrdecillion, šedesát tredecillion, dva sto" " a šedesát šest duodecillion, jedna sto a " "sedmdesát šest undecillion, pět decillion, čtyři " "sto a čtyřicet nonillion, pět sto a" " šedesát sedm octillion, třicet dva septillion, " "tři sto a třicet jedna sextillion, " "dva sto a osm quintillion, čtyři sto a " "tři quadrillion, devět sto a čtyřicet osm " "trillion, dva sto a třicet tři billion, " "tři sto a sedmdesát tři million, pět " "sto a patnáct tisíc, sedm sto a " "sedmdesát šest") # infinity self.assertEqual( pronounce_number(sys.float_info.max * 2), "nekonečno") self.assertEqual( pronounce_number(float("inf")), "nekonečno") self.assertEqual( pronounce_number(float("-inf")), "záporné nekonečno") def test_ordinals(self): self.assertEqual(pronounce_number(1, ordinals=True), "první") self.assertEqual(pronounce_number(10, ordinals=True), "desátý") self.assertEqual(pronounce_number(15, ordinals=True), "patnáctý") self.assertEqual(pronounce_number(20, ordinals=True), "dvacátý") self.assertEqual(pronounce_number(27, ordinals=True), "dvacet sedmý") self.assertEqual(pronounce_number(30, ordinals=True), "třicátý") self.assertEqual(pronounce_number(33, ordinals=True), "třicet třetí") 
@classmethod
def setUpClass(cls):
    # Load every <language>/date_time_test.json found under the formatter's
    # config directory into cls.test_config, keyed by the language
    # directory name.
    cls.test_config = {}
    config_root = Path(date_time_format.config_path)
    for sub_dir in config_root.iterdir():
        if not sub_dir.is_dir():
            continue
        fixture_path = sub_dir / 'date_time_test.json'
        if not fixture_path.exists():
            continue
        print("Načítám test pro " + str(fixture_path))
        # The fixtures contain non-ASCII text; decode explicitly as UTF-8
        # instead of relying on the platform's locale encoding.
        with fixture_path.open(encoding="utf-8") as f:
            cls.test_config[sub_dir.name] = json.load(f)
"13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "třináct dvacet dva") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "třináct dvacet dva") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "jedna hodin") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "jedna p.m.") self.assertEqual(nice_time(dt, use_24hour=False, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "třináct sto") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "třináct sto") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "jedna oh dva") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "jedna oh dva p.m.") self.assertEqual(nice_time(dt, use_24hour=False, speech=False), "1:02") self.assertEqual(nice_time(dt, use_24hour=False, speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "třináct nula dva") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "třináct nula dva") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "dvanáct oh dva") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "dvanáct oh dva a.m.") self.assertEqual(nice_time(dt, speech=False, use_24hour=False), "12:02") 
def _fixture_cases(self, lang, section):
    """Yield the entries of *section* stored under the keys "1", "2", ...

    The JSON fixtures are keyed by ``str``; looking them up with a bytes
    key never matches, which would silently skip every case. Iteration
    stops at the first missing or empty key, or immediately when the
    language has no such section.
    """
    cases = self.test_config[lang].get(section) or {}
    index = 1
    while cases.get(str(index)):
        yield cases[str(index)]
        index += 1

@staticmethod
def _aware_datetime(fields):
    """Build a default-timezone datetime from (Y, M, D, h, m, s)."""
    return datetime.datetime(*fields[:6], tzinfo=default_timezone())

def test_nice_date(self):
    lang = "cs-cz"
    for case in self._fixture_cases(lang, 'test_nice_date'):
        dt = self._aware_datetime(ast.literal_eval(case['datetime_param']))
        now_fields = ast.literal_eval(case['now'])
        now = self._aware_datetime(now_fields) if now_fields else None
        print('Testing for ' + lang + ' that ' + str(dt) +
              ' is date ' + case['assertEqual'])
        self.assertEqual(case['assertEqual'],
                         nice_date(dt, lang=lang, now=now))

    # The fall-back-to-English check is currently not exercised here.

    # Every day in a 368-day window starting 2017-12-30 must produce
    # some output.
    first_day = datetime.datetime(2017, 12, 30, 0, 2, 3,
                                  tzinfo=default_timezone())
    for offset in range(368):
        dt = first_day + datetime.timedelta(offset)
        self.assertTrue(len(nice_date(dt, lang=lang)) > 0)

def test_nice_date_time(self):
    lang = "cs-cz"
    for case in self._fixture_cases(lang, 'test_nice_date_time'):
        dt = self._aware_datetime(ast.literal_eval(case['datetime_param']))
        now_fields = ast.literal_eval(case['now'])
        # Keep `now` timezone-aware like `dt`; naive and aware datetimes
        # must not be mixed.
        now = self._aware_datetime(now_fields) if now_fields else None
        print('Testing for ' + lang + ' that ' + str(dt) +
              ' is date time ' + case['assertEqual'])
        self.assertEqual(
            case['assertEqual'],
            nice_date_time(
                dt, lang=lang, now=now,
                use_24hour=ast.literal_eval(case['use_24hour']),
                use_ampm=ast.literal_eval(case['use_ampm'])))

def test_nice_year(self):
    lang = "cs-cz"
    for case in self._fixture_cases(lang, 'test_nice_year'):
        dt = self._aware_datetime(ast.literal_eval(case['datetime_param']))
        print('Testing for ' + lang + ' that ' + str(dt) +
              ' is year ' + case['assertEqual'])
        self.assertEqual(case['assertEqual'], nice_year(
            dt, lang=lang, bc=ast.literal_eval(case['bc'])))

    # Every year from 1 through 9998 must produce some output.
    print("Test all years in " + lang)
    for year in range(1, 9999):
        dt = datetime.datetime(year, 1, 31, 13, 2, 3,
                               tzinfo=default_timezone())
        self.assertTrue(len(nice_year(dt, lang=lang)) > 0)
self.assertEqual(join_list([1, "b", 3, "d"], "nebo"), "1, b, 3 nebo d") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_da.py000066400000000000000000000422261426211343400223630ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest import datetime from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_number, nice_time, nice_response, \ pronounce_number # from lingua_franca.format import nice_time # from lingua_franca.format import pronounce_number # # from mycroft_parsers.lang.format_da import nice_response from lingua_franca.lang.format_da import pronounce_ordinal_da # internal to da from lingua_franca.time import default_timezone def setUpModule(): load_language('da') set_default_lang('da') def tearDownModule(): unload_language('da') # fractions are not capitalized for now NUMBERS_FIXTURE_da = { 1.435634: '1,436', 2: '2', 5.0: '5', 1234567890: '1234567890', 12345.67890: '12345,679', 0.027: '0,027', 0.5: '1 halv', 1.333: '1 og 1 trediedel', 2.666: '2 og 2 trediedele', 0.25: '1 fjerdedel', 1.25: '1 og 1 fjerdedel', 0.75: '3 fjerdedele', 1.75: '1 og 3 fjerdedele', 3.4: '3 og 2 femtedele', 16.8333: '16 og 5 sjettedele', 12.5714: '12 og 4 syvendedele', 9.625: '9 og 5 ottendedele', 6.777: '6 og 7 niendedele', 3.1: '3 og 1 tiendedel', 2.272: '2 og 3 elftedele', 5.583: '5 og 7 tolvtedele', 8.384: '8 og 5 
class TestNiceNumberFormat(unittest.TestCase):
    # Danish nice_number checks. Each value is computed once with an
    # explicit lang="da-dk" and reused in the failure message, so the
    # message always reports the value that was actually compared.

    def test_convert_float_to_nice_number(self):
        for number, number_str in NUMBERS_FIXTURE_da.items():
            with self.subTest(number=number):
                result = nice_number(number, lang="da-dk")
                self.assertEqual(
                    result, number_str,
                    'should format {} as {} and not {}'.format(
                        number, number_str, result))

    def test_specify_danominator(self):
        result = nice_number(5.5, lang="da-dk", denominators=[1, 2, 3])
        self.assertEqual(
            result, '5 og 1 halv',
            'should format 5.5 as 5 og 1 halv not {}'.format(result))

        # 2.333 cannot be expressed with the allowed denominators, so the
        # decimal form is expected.
        result = nice_number(2.333, lang="da-dk", denominators=[1, 2])
        self.assertEqual(
            result, '2,333',
            'should format 2.333 as 2,333 not {}'.format(result))

    def test_no_speech(self):
        result = nice_number(6.777, lang="da-dk", speech=False)
        self.assertEqual(
            result, '6 7/9',
            'should format 6.777 as 6 7/9 not {}'.format(result))

        result = nice_number(6.0, lang="da-dk", speech=False)
        self.assertEqual(
            result, '6',
            'should format 6.0 as 6 not {}'.format(result))
self.assertEqual(pronounce_ordinal_da(2000), "totusindende") self.assertEqual(pronounce_ordinal_da(1000), "ettusindende") # self.assertEqual(pronounce_ordinal_da(123456), # "ethundredetreogtyvetusindefirehundredeseksog\ # halvtresende") class TestPronounceNumber(unittest.TestCase): def test_convert_int_da(self): # self.assertEqual(pronounce_number(123456789123456789, lang="da-dk"), # "ethundredetreogtyvebilliarder" # "firehundredeseksoghalvtresbillioner" # "syvhundredeogfirsmiliarder" # "ethundredetreogtyvemillioner" # "firehundredeseksoghalvtrestusindesyvhundredeniog \ # firs") self.assertEqual(pronounce_number(1, lang="da-dk"), "en") self.assertEqual(pronounce_number(10, lang="da-dk"), "ti") self.assertEqual(pronounce_number(15, lang="da-dk"), "femten") self.assertEqual(pronounce_number(20, lang="da-dk"), "tyve") self.assertEqual(pronounce_number(27, lang="da-dk"), "syvogtyve") self.assertEqual(pronounce_number(30, lang="da-dk"), "tredive") self.assertEqual(pronounce_number(33, lang="da-dk"), "treogtredive") self.assertEqual(pronounce_number(71, lang="da-dk"), "enoghalvfjers") self.assertEqual(pronounce_number(80, lang="da-dk"), "firs") self.assertEqual(pronounce_number(74, lang="da-dk"), "fireoghalvfjers") self.assertEqual(pronounce_number(79, lang="da-dk"), "nioghalvfjers") self.assertEqual(pronounce_number(91, lang="da-dk"), "enoghalvfems") self.assertEqual(pronounce_number(97, lang="da-dk"), "syvoghalvfems") self.assertEqual(pronounce_number(300, lang="da-dk"), "trehundrede") def test_convert_negative_int_da(self): self.assertEqual(pronounce_number(-1, lang="da-dk"), "minus en") self.assertEqual(pronounce_number(-10, lang="da-dk"), "minus ti") self.assertEqual(pronounce_number(-15, lang="da-dk"), "minus femten") self.assertEqual(pronounce_number(-20, lang="da-dk"), "minus tyve") self.assertEqual(pronounce_number(-27, lang="da-dk"), "minus syvogtyve") self.assertEqual(pronounce_number(-30, lang="da-dk"), "minus tredive") self.assertEqual(pronounce_number(-33, 
lang="da-dk"), "minus treogtredive") def test_convert_dacimals_da(self): self.assertEqual(pronounce_number(1.234, lang="da-dk"), "en komma to tre") self.assertEqual(pronounce_number(21.234, lang="da-dk"), "enogtyve komma to tre") self.assertEqual(pronounce_number(21.234, lang="da-dk", places=1), "enogtyve komma to") self.assertEqual(pronounce_number(21.234, lang="da-dk", places=0), "enogtyve") self.assertEqual(pronounce_number(21.234, lang="da-dk", places=3), "enogtyve komma to tre fire") self.assertEqual(pronounce_number(21.234, lang="da-dk", places=4), "enogtyve komma to tre fire nul") self.assertEqual(pronounce_number(21.234, lang="da-dk", places=5), "enogtyve komma to tre fire nul nul") self.assertEqual(pronounce_number(-1.234, lang="da-dk"), "minus en komma to tre") self.assertEqual(pronounce_number(-21.234, lang="da-dk"), "minus enogtyve komma to tre") self.assertEqual(pronounce_number(-21.234, lang="da-dk", places=1), "minus enogtyve komma to") self.assertEqual(pronounce_number(-21.234, lang="da-dk", places=0), "minus enogtyve") self.assertEqual(pronounce_number(-21.234, lang="da-dk", places=3), "minus enogtyve komma to tre fire") self.assertEqual(pronounce_number(-21.234, lang="da-dk", places=4), "minus enogtyve komma to tre fire nul") self.assertEqual(pronounce_number(-21.234, lang="da-dk", places=5), "minus enogtyve komma to tre fire nul nul") # def nice_time(dt, lang="da-dk", speech=True, use_24hour=False, # use_ampm=False): class TestNiceDateFormat_da(unittest.TestCase): def test_convert_times_da(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "et toogtyve") self.assertEqual(nice_time(dt, lang="da-dk", use_ampm=True), "et toogtyve om eftermiddagen") self.assertEqual(nice_time(dt, lang="da-dk", speech=False), "01:22") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_ampm=True), "01:22 PM") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, 
use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=True), "tretten toogtyve") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=False), "tretten toogtyve") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "et") self.assertEqual(nice_time(dt, lang="da-dk", use_ampm=True), "et om eftermiddagen") self.assertEqual(nice_time(dt, lang="da-dk", speech=False), "01:00") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_ampm=True), "01:00 PM") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=True), "tretten") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=False), "tretten") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "et nul to") self.assertEqual(nice_time(dt, lang="da-dk", use_ampm=True), "et nul to om eftermiddagen") self.assertEqual(nice_time(dt, lang="da-dk", speech=False), "01:02") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_ampm=True), "01:02 PM") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=True), "tretten nul to") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=False), "tretten nul to") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "tolv nul to") self.assertEqual(nice_time(dt, lang="da-dk", 
use_ampm=True), "tolv nul to om natten") self.assertEqual(nice_time(dt, lang="da-dk", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=True), "nul nul to") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=False), "nul nul to") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "tolv femten") self.assertEqual(nice_time(dt, lang="da-dk", use_ampm=True), "tolv femten om eftermiddagen") self.assertEqual(nice_time(dt, lang="da-dk", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=True), "tolv femten") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=False), "tolv femten") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "syv fyrre") self.assertEqual(nice_time(dt, lang="da-dk", use_ampm=True), "syv fyrre om aftenen") self.assertEqual(nice_time(dt, lang="da-dk", speech=False), "07:40") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_ampm=True), "07:40 PM") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="da-dk", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=True), "nitten fyrre") 
self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True, use_ampm=False), "nitten fyrre") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk", use_24hour=True), "et femten") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "et femogtredive") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "et femogfyrre") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "fire halvtres") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk"), "fem femoghalvtres") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="da-dk", use_ampm=True), "fem tredive om morgenen") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_de.py000066400000000000000000000424711426211343400223710ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest import datetime from lingua_franca import get_default_lang, set_default_lang, load_language, \ unload_language from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.lang.format_de import nice_response_de from lingua_franca.lang.format_de import pronounce_ordinal_de from lingua_franca.format import join_list from lingua_franca.time import default_timezone def setUpModule(): load_language('de-de') def tearDownModule(): unload_language('de-de') # fractions are not capitalized for now NUMBERS_FIXTURE_DE = { 1.435634: '1,436', 2: '2', 5.0: '5', 1234567890: '1234567890', 12345.67890: '12345,679', 0.027: '0,027', 0.5: 'ein halb', 1.333: '1 und ein drittel', 2.666: '2 und 2 drittel', 0.25: 'ein viertel', 1.25: '1 und ein viertel', 0.75: '3 viertel', 1.75: '1 und 3 viertel', 3.4: '3 und 2 fünftel', 16.8333: '16 und 5 sechstel', 12.5714: '12 und 4 siebtel', 9.625: '9 und 5 achtel', 6.777: '6 und 7 neuntel', 3.1: '3 und ein zehntel', 2.272: '2 und 3 elftel', 5.583: '5 und 7 zwölftel', 8.384: '8 und 5 dreizehntel', 0.071: 'ein vierzehntel', 6.466: '6 und 7 fünfzehntel', 8.312: '8 und 5 sechzehntel', 2.176: '2 und 3 siebzehntel', 200.722: '200 und 13 achtzehntel', 7.421: '7 und 8 neunzehntel', 0.05: 'ein zwanzigstel' } class TestNiceResponse(unittest.TestCase): def test_replace_ordinal(self): self.assertEqual(nice_response_de("dies ist der 31. mai"), "dies ist der einunddreißigste mai") self.assertEqual(nice_response_de("es fängt am 31. mai an"), "es fängt am einunddreißigsten mai an") self.assertEqual(nice_response_de("der 31. 
mai"), "der einunddreißigste mai") self.assertEqual(nice_response_de("10 ^ 2"), "10 hoch 2") class TestNiceNumberFormat(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("de-de") def tearDown(self): set_default_lang(self.old_lang) def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_DE.items(): self.assertEqual(nice_number(number), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), '5 und ein halb', 'should format 5.5 as 5 und ein halb not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, denominators=[1, 2]), '2,333', 'should format 2,333 as 2,333 not {}'.format( nice_number(2.333, denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, speech=False))) class TestPronounceOrdinal(unittest.TestCase): def test_convert_int_de(self): self.assertEqual(pronounce_ordinal_de(0), "nullte") self.assertEqual(pronounce_ordinal_de(1), "erste") self.assertEqual(pronounce_ordinal_de(3), "dritte") self.assertEqual(pronounce_ordinal_de(5), "fünfte") self.assertEqual(pronounce_ordinal_de(1000), "eintausendste") self.assertEqual(pronounce_ordinal_de(123456), "einhundertdreiundzwanzigtausendvierhundertsechsund" "fünfzigste") # def pronounce_number(number, lang="de-de", places=2): class TestPronounceNumber(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("de-de") def tearDown(self): set_default_lang(self.old_lang) def test_convert_int_de(self): self.assertEqual(pronounce_number(123456789123456789), "einhundertdreiundzwanzig Billiarden " 
"vierhundertsechsundfünfzig Billionen " "siebenhundertneunundachtzig Milliarden " "einhundertdreiundzwanzig Millionen " "vierhundertsechsundfünfzigtausendsiebenhundert" "neunundachtzig") self.assertEqual(pronounce_number(1), "eins") self.assertEqual(pronounce_number(10), "zehn") self.assertEqual(pronounce_number(15), "fünfzehn") self.assertEqual(pronounce_number(20), "zwanzig") self.assertEqual(pronounce_number(27), "siebenundzwanzig") self.assertEqual(pronounce_number(30), "dreißig") self.assertEqual(pronounce_number(33), "dreiunddreißig") self.assertEqual(pronounce_number(71), "einundsiebzig") self.assertEqual(pronounce_number(80), "achtzig") self.assertEqual(pronounce_number(74), "vierundsiebzig") self.assertEqual(pronounce_number(79), "neunundsiebzig") self.assertEqual(pronounce_number(91), "einundneunzig") self.assertEqual(pronounce_number(97), "siebenundneunzig") self.assertEqual(pronounce_number(300), "dreihundert") def test_convert_negative_int_de(self): self.assertEqual(pronounce_number(-1), "minus eins") self.assertEqual(pronounce_number(-10), "minus zehn") self.assertEqual(pronounce_number(-15), "minus fünfzehn") self.assertEqual(pronounce_number(-20), "minus zwanzig") self.assertEqual(pronounce_number(-27), "minus siebenundzwanzig") self.assertEqual(pronounce_number(-30), "minus dreißig") self.assertEqual(pronounce_number(-33), "minus dreiunddreißig") def test_convert_decimals_de(self): self.assertEqual(pronounce_number(1.234), "eins Komma zwei drei") self.assertEqual(pronounce_number(21.234), "einundzwanzig Komma zwei drei") self.assertEqual(pronounce_number(21.234, places=1), "einundzwanzig Komma zwei") self.assertEqual(pronounce_number(21.234, places=0), "einundzwanzig") self.assertEqual(pronounce_number(21.234, places=3), "einundzwanzig Komma zwei drei vier") self.assertEqual(pronounce_number(21.234, places=4), "einundzwanzig Komma zwei drei vier null") self.assertEqual(pronounce_number(21.234, places=5), "einundzwanzig Komma zwei drei vier null 
null") self.assertEqual(pronounce_number(-1.234), "minus eins Komma zwei drei") self.assertEqual(pronounce_number(-21.234), "minus einundzwanzig Komma zwei drei") self.assertEqual(pronounce_number(-21.234, places=1), "minus einundzwanzig Komma zwei") self.assertEqual(pronounce_number(-21.234, places=0), "minus einundzwanzig") self.assertEqual(pronounce_number(-21.234, places=3), "minus einundzwanzig Komma zwei drei vier") self.assertEqual(pronounce_number(-21.234, places=4), "minus einundzwanzig Komma zwei drei vier null") self.assertEqual(pronounce_number(-21.234, places=5), "minus einundzwanzig Komma zwei drei vier null null") # def nice_time(dt, lang="de-de", speech=True, use_24hour=False, # use_ampm=False): class TestNiceDateFormat_de(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("de-de") def tearDown(self): set_default_lang(self.old_lang) def test_convert_times_de(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "ein Uhr zweiundzwanzig") self.assertEqual(nice_time(dt, use_ampm=True), "ein Uhr zweiundzwanzig nachmittags") self.assertEqual(nice_time(dt, speech=False), "1:22") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "dreizehn Uhr zweiundzwanzig") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "dreizehn Uhr zweiundzwanzig") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "ein Uhr") self.assertEqual(nice_time(dt, use_ampm=True), "ein Uhr nachmittags") self.assertEqual(nice_time(dt, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), 
"13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "dreizehn Uhr") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "dreizehn Uhr") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "ein Uhr zwei") self.assertEqual(nice_time(dt, use_ampm=True), "ein Uhr zwei nachmittags") self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "dreizehn Uhr zwei") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "dreizehn Uhr zwei") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "zwölf Uhr zwei") self.assertEqual(nice_time(dt, use_ampm=True), "zwölf Uhr zwei nachts") self.assertEqual(nice_time(dt, speech=False), "12:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "null Uhr zwei") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "null Uhr zwei") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "viertel eins") self.assertEqual(nice_time(dt, use_ampm=True), "viertel eins nachmittags") self.assertEqual(nice_time(dt, speech=False), "12:15") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, speech=False, 
use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "zwölf Uhr fünfzehn") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "zwölf Uhr fünfzehn") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "sieben Uhr vierzig") self.assertEqual(nice_time(dt, use_ampm=True), "sieben Uhr vierzig abends") self.assertEqual(nice_time(dt, speech=False), "7:40") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "neunzehn Uhr vierzig") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "neunzehn Uhr vierzig") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=True), "ein Uhr fünfzehn") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "ein Uhr fünfunddreißig") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dreiviertel zwei") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "vier Uhr fünfzig") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "fünf Uhr fünfundfünfzig") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_ampm=True), "halb sechs morgens") class TestJoinList_de(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("de-de") def tearDown(self): set_default_lang(self.old_lang) def test_join_list_de(self): self.assertEqual(join_list(['Hallo', 'Auf wieder Sehen'], 'and'), 'Hallo und Auf wieder Sehen') 
self.assertEqual(join_list(['A', 'B', 'C'], 'or'), 'A, B oder C') if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_es.py000066400000000000000000000412611426211343400224040ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest import datetime from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone def setUpModule(): load_language('es') set_default_lang('es') def tearDownModule(): unload_language('es') NUMBERS_FIXTURE_ES = { 1.435634: '1,436', 2: '2', 5.0: '5', 0.027: '0,027', 0.5: 'un medio', 1.333: '1 y 1 tercio', 2.666: '2 y 2 tercio', 0.25: 'un cuarto', 1.25: '1 y 1 cuarto', 0.75: '3 cuartos', 1.75: '1 y 3 cuartos', 3.4: '3 y 2 quintos', 16.8333: '16 y 5 sextos', 12.5714: '12 y 4 séptimos', 9.625: '9 y 5 octavos', 6.777: '6 y 7 novenos', 3.1: '3 y 1 décimo', 2.272: '2 y 3 onceavos', 5.583: '5 y 7 doceavos', 8.384: '8 y 5 treceavos', 0.071: 'un catorceavo', 6.466: '6 y 7 quinceavos', 8.312: '8 y 5 dieciseisavos', 2.176: '2 y 3 diecisieteavos', 200.722: '200 y 13 dieciochoavos', 7.421: '7 y 8 diecinueveavos', 0.05: 'un veinteavo' } class TestNiceNumberFormat_es(unittest.TestCase): def test_convert_float_to_nice_number_es(self): for number, number_str in 
NUMBERS_FIXTURE_ES.items(): self.assertEqual(nice_number(number, lang="es-es"), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number( number, lang="es-es"))) def test_specify_denominator_es(self): self.assertEqual(nice_number(5.5, lang="es-es", denominators=[1, 2, 3]), '5 y medio', 'should format 5.5 as 5 y medio not {}'.format( nice_number(5.5, lang="es-es", denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, lang="es-es", denominators=[1, 2]), '2,333', 'should format 2.333 as 2,333 not {}'.format( nice_number(2.333, lang="es-es", denominators=[1, 2]))) def test_no_speech_es(self): self.assertEqual(nice_number(6.777, lang="es-es", speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, lang="es-es", speech=False))) self.assertEqual(nice_number(6.0, lang="es-es", speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, lang="es-es", speech=False))) self.assertEqual(nice_number(1234567890, lang="es-es", speech=False), '1 234 567 890', 'should format 1234567890 as' '1 234 567 890 not {}'.format( nice_number(1234567890, lang="es-es", speech=False))) self.assertEqual(nice_number(12345.6789, lang="es-es", speech=False), '12 345,679', 'should format 12345.6789 as' '12 345,679 not {}'.format( nice_number(12345.6789, lang="es-es", speech=False))) class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0, lang="es"), "cero") self.assertEqual(pronounce_number(1, lang="es"), "uno") self.assertEqual(pronounce_number(10, lang="es"), "diez") self.assertEqual(pronounce_number(15, lang="es"), "quince") self.assertEqual(pronounce_number(21, lang="es"), "veintiuno") self.assertEqual(pronounce_number(27, lang="es"), "veintisiete") self.assertEqual(pronounce_number(30, lang="es"), "treinta") self.assertEqual(pronounce_number(19, lang="es"), "diecinueve") self.assertEqual(pronounce_number(88, lang="es"), "ochenta y ocho") 
self.assertEqual(pronounce_number(46, lang="es"), "cuarenta y seis") self.assertEqual(pronounce_number(99, lang="es"), "noventa y nueve") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1, lang="es"), "menos uno") self.assertEqual(pronounce_number(-10, lang="es"), "menos diez") self.assertEqual(pronounce_number(-15, lang="es"), "menos quince") self.assertEqual(pronounce_number(-21, lang="es"), "menos veintiuno") self.assertEqual(pronounce_number(-27, lang="es"), "menos veintisiete") self.assertEqual(pronounce_number(-30, lang="es"), "menos treinta") self.assertEqual(pronounce_number(-35, lang="es"), "menos treinta y cinco") self.assertEqual(pronounce_number(-83, lang="es"), "menos ochenta y tres") self.assertEqual(pronounce_number(-19, lang="es"), "menos diecinueve") self.assertEqual(pronounce_number(-88, lang="es"), "menos ochenta y ocho") self.assertEqual(pronounce_number(-46, lang="es"), "menos cuarenta y seis") self.assertEqual(pronounce_number(-99, lang="es"), "menos noventa y nueve") def test_convert_decimals(self): self.assertEqual(pronounce_number( 0.05, lang="es"), "cero coma cero cinco") self.assertEqual(pronounce_number( -0.05, lang="es"), "menos cero coma cero cinco") self.assertEqual(pronounce_number(1.234, lang="es"), "uno coma dos tres") self.assertEqual(pronounce_number(21.234, lang="es"), "veintiuno coma dos tres") self.assertEqual(pronounce_number(21.234, lang="es", places=1), "veintiuno coma dos") self.assertEqual(pronounce_number(21.234, lang="es", places=0), "veintiuno") self.assertEqual(pronounce_number(21.234, lang="es", places=3), "veintiuno coma dos tres cuatro") self.assertEqual(pronounce_number(21.234, lang="es", places=4), "veintiuno coma dos tres cuatro") self.assertEqual(pronounce_number(21.234, lang="es", places=5), "veintiuno coma dos tres cuatro") self.assertEqual(pronounce_number(-21.234, lang="es"), "menos veintiuno coma dos tres") self.assertEqual(pronounce_number(-21.234, lang="es", places=1), "menos 
veintiuno coma dos") self.assertEqual(pronounce_number(-21.234, lang="es", places=0), "menos veintiuno") self.assertEqual(pronounce_number(-21.234, lang="es", places=3), "menos veintiuno coma dos tres cuatro") self.assertEqual(pronounce_number(-21.234, lang="es", places=4), "menos veintiuno coma dos tres cuatro") self.assertEqual(pronounce_number(-21.234, lang="es", places=5), "menos veintiuno coma dos tres cuatro") class TestNiceDateFormat(unittest.TestCase): def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed self.assertEqual(nice_time(dt, lang="es-es"), nice_time(dt, "es-es", True, False, False)) self.assertEqual(nice_time(dt, lang="es"), "la una y veintidós") self.assertEqual(nice_time(dt, lang="es", use_ampm=True), "la una y veintidós de la tarde") self.assertEqual(nice_time(dt, lang="es", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="es", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=True), "las trece veintidós") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=False), "las trece veintidós") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es"), "la una en punto") self.assertEqual(nice_time(dt, lang="es", use_ampm=True), "la una de la tarde") self.assertEqual(nice_time(dt, lang="es", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="es", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=True), "las 
trece cero cero") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es", use_24hour=True), "las trece cero dos") self.assertEqual(nice_time(dt, lang="es", use_ampm=True), "la una y dos de la tarde") self.assertEqual(nice_time(dt, lang="es", speech=False), "1:02") self.assertEqual(nice_time(dt, lang="es", speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=True), "las trece cero dos") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=False), "las trece cero dos") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es"), "las doce y dos") self.assertEqual(nice_time(dt, lang="es", use_ampm=True), "las doce y dos de la madrugada") self.assertEqual(nice_time(dt, lang="es", use_24hour=True), "las cero cero dos") self.assertEqual(nice_time(dt, lang="es", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="es", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="es", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=True), "las cero cero dos") self.assertEqual(nice_time(dt, lang="es", use_24hour=True, use_ampm=False), "las cero cero dos") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es"), "las doce y cuarto") self.assertEqual(nice_time(dt, lang="es-es", use_ampm=True), "las doce y cuarto de la mañana") self.assertEqual(nice_time(dt, lang="es-es", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="es-es", speech=False, use_ampm=True), "12:15 
PM") self.assertEqual(nice_time(dt, lang="es-es", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="es-es", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="es-es", use_24hour=True, use_ampm=True), "las doce quince") self.assertEqual(nice_time(dt, lang="es-es", use_24hour=True, use_ampm=False), "las doce quince") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es"), "las ocho menos veinte") self.assertEqual(nice_time(dt, lang="es-es", use_ampm=True), "las ocho menos veinte de la tarde") self.assertEqual(nice_time(dt, lang="es-es", speech=False), "7:40") self.assertEqual(nice_time(dt, lang="es-es", speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, lang="es-es", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="es-es", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="es-es", use_24hour=True, use_ampm=True), "las diecinueve cuarenta") self.assertEqual(nice_time(dt, lang="es-es", use_24hour=True, use_ampm=False), "las diecinueve cuarenta") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es", use_24hour=True), "la una quince") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es"), "las dos menos veinticinco") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es"), "las dos menos cuarto") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es"), "las cinco menos diez") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es"), "las seis menos cinco") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) 
self.assertEqual(nice_time(dt, lang="es-es", use_ampm=True), "las cinco y media de la madrugada") dt = datetime.datetime(2017, 1, 31, 23, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="es-es", use_24hour=True, use_ampm=True), "las veintitrés quince") self.assertEqual(nice_time(dt, lang="es-es", use_24hour=False, use_ampm=True), "las once y cuarto de la noche") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_eu.py000066400000000000000000000432371426211343400224130ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest import datetime from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.time import now_local from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import nice_relative_time from lingua_franca.format import pronounce_number # https://www.euskaltzaindia.eus/index.php?&option=com_ebe&view=bilaketa&Itemid=1161&task=bilaketa&lang=eu&id=1392 def setUpModule(): load_language('eu') set_default_lang('eu') def tearDownModule(): unload_language('eu') NUMBERS_FIXTURE_EU = { 1.435634: '1,436', 2: '2', 5.0: '5', 0.027: '0,027', 0.5: 'erdi bat', 1.333: '1 eta heren bat', 2.666: '2 eta 2 heren', 0.25: 'laurden bat', 1.25: '1 eta laurden bat', 0.75: '3 laurden', 1.75: '1 eta 3 laurden', 3.4: '3 eta 2 bosten', 16.8333: '16 eta 5 seiren', 12.5714: '12 eta 4 zazpiren', 9.625: '9 eta 5 zortziren', 6.777: '6 eta 7 bederatziren', 3.1: '3 eta hamarren bat', 2.272: '2 eta 3 hamaikaren', 5.583: '5 eta 7 hamabiren', 8.384: '8 eta 5 hamahiruren', 0.071: 'hamalauren bat', 6.466: '6 eta 7 hamabosten', 8.312: '8 eta 5 hamaseiren', 2.176: '2 eta 3 hamazazpiren', 200.722: '200 eta 13 hemezortziren', 7.421: '7 eta 8 hemeretziren', 0.05: 'hogeiren bat' } class TestNiceNumberFormat_eu(unittest.TestCase): def test_convert_float_to_nice_number_eu(self): for number, number_str in NUMBERS_FIXTURE_EU.items(): self.assertEqual(nice_number(number, lang="eu-eu"), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number( number, lang="eu-eu"))) def test_specify_denominator_eu(self): self.assertEqual(nice_number(5.5, lang="eu-eu", denominators=[1, 2, 3]), '5 eta erdi', 'should format 5.5 as 5 eta erdi not {}'.format( nice_number(5.5, lang="eu-eu", denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, lang="eu-eu", denominators=[1, 2]), '2,333', 'should format 2.333 as 2,333 not {}'.format( nice_number(2.333, lang="eu-eu", denominators=[1, 2]))) def 
test_no_speech_eu(self): self.assertEqual(nice_number(6.777, lang="eu-eu", speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, lang="eu-eu", speech=False))) self.assertEqual(nice_number(6.0, lang="eu-eu", speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, lang="eu-eu", speech=False))) self.assertEqual(nice_number(1234567890, lang="eu-eu", speech=False), '1 234 567 890', 'should format 1234567890 as' '1 234 567 890 not {}'.format( nice_number(1234567890, lang="eu-eu", speech=False))) self.assertEqual(nice_number(12345.6789, lang="eu-eu", speech=False), '12 345,679', 'should format 12345.6789 as' '12 345,679 not {}'.format( nice_number(12345.6789, lang="eu-eu", speech=False))) # https://www.euskaltzaindia.eus/dok/arauak/Araua_0007.pdf class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): # self.assertEqual(pronounce_number(0, lang="eu"), "zero") self.assertEqual(pronounce_number(1, lang="eu"), "bat") self.assertEqual(pronounce_number(10, lang="eu"), "hamar") self.assertEqual(pronounce_number(15, lang="eu"), "hamabost") self.assertEqual(pronounce_number(21, lang="eu"), "hogeita bat") self.assertEqual(pronounce_number(27, lang="eu"), "hogeita zazpi") self.assertEqual(pronounce_number(30, lang="eu"), "hogeita hamar") self.assertEqual(pronounce_number(19, lang="eu"), "hemeretzi") self.assertEqual(pronounce_number(88, lang="eu"), "laurogeita zortzi") self.assertEqual(pronounce_number(46, lang="eu"), "berrogeita sei") self.assertEqual(pronounce_number(99, lang="eu"), "laurogeita hemeretzi") self.assertEqual(pronounce_number(399, lang="eu"), "hirurehun eta laurogeita hemeretzi") self.assertEqual(pronounce_number(1200, lang="eu"), "mila eta berrehun") self.assertEqual(pronounce_number(1202, lang="eu"), "mila berrehun eta bi") self.assertEqual(pronounce_number(1359, lang="eu"), "mila hirurehun eta berrogeita hemeretzi") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1, 
lang="eu"), "minus bat") self.assertEqual(pronounce_number(-10, lang="eu"), "minus hamar") self.assertEqual(pronounce_number(-15, lang="eu"), "minus hamabost") self.assertEqual(pronounce_number(-21, lang="eu"), "minus hogeita bat") self.assertEqual(pronounce_number(-27, lang="eu"), "minus hogeita zazpi") self.assertEqual(pronounce_number(-30, lang="eu"), "minus hogeita hamar") self.assertEqual(pronounce_number(-35, lang="eu"), "minus hogeita hamabost") self.assertEqual(pronounce_number(-83, lang="eu"), "minus laurogeita hiru") self.assertEqual(pronounce_number(-19, lang="eu"), "minus hemeretzi") self.assertEqual(pronounce_number(-88, lang="eu"), "minus laurogeita zortzi") self.assertEqual(pronounce_number(-46, lang="eu"), "minus berrogeita sei") self.assertEqual(pronounce_number(-99, lang="eu"), "minus laurogeita hemeretzi") def test_convert_decimals(self): self.assertEqual(pronounce_number( 0.05, lang="eu"), "zero koma zero bost") self.assertEqual(pronounce_number( -0.05, lang="eu"), "minus zero koma zero bost") self.assertEqual(pronounce_number(1.234, lang="eu"), "bat koma bi hiru") self.assertEqual(pronounce_number(21.234, lang="eu"), "hogeita bat koma bi hiru") self.assertEqual(pronounce_number(21.234, lang="eu", places=1), "hogeita bat koma bi") self.assertEqual(pronounce_number(21.234, lang="eu", places=0), "hogeita bat") self.assertEqual(pronounce_number(21.234, lang="eu", places=3), "hogeita bat koma bi hiru lau") self.assertEqual(pronounce_number(21.234, lang="eu", places=4), "hogeita bat koma bi hiru lau") self.assertEqual(pronounce_number(21.234, lang="eu", places=5), "hogeita bat koma bi hiru lau") self.assertEqual(pronounce_number(-21.234, lang="eu"), "minus hogeita bat koma bi hiru") self.assertEqual(pronounce_number(-21.234, lang="eu", places=1), "minus hogeita bat koma bi") self.assertEqual(pronounce_number(-21.234, lang="eu", places=0), "minus hogeita bat") self.assertEqual(pronounce_number(-21.234, lang="eu", places=3), "minus hogeita bat koma bi 
hiru lau") self.assertEqual(pronounce_number(-21.234, lang="eu", places=4), "minus hogeita bat koma bi hiru lau") self.assertEqual(pronounce_number(-21.234, lang="eu", places=5), "minus hogeita bat koma bi hiru lau") class TestNiceDateFormat(unittest.TestCase): def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3) # Verify defaults haven't changed self.assertEqual(nice_time(dt, lang="eu-eu"), nice_time(dt, "eu-eu", True, False, False)) self.assertEqual(nice_time(dt, lang="eu"), "ordubata eta hogeita bi") self.assertEqual(nice_time(dt, lang="eu", use_ampm=True), "arratsaldeko ordubata eta hogeita bi") self.assertEqual(nice_time(dt, lang="eu", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=True), "hamahiruak hogeita bi") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=False), "hamahiruak hogeita bi") dt = datetime.datetime(2017, 1, 31, 13, 0, 3) self.assertEqual(nice_time(dt, lang="eu"), "ordubata puntuan") self.assertEqual(nice_time(dt, lang="eu", use_ampm=True), "arratsaldeko ordubata") self.assertEqual(nice_time(dt, lang="eu", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=True), "hamahiruak zero zero") dt = datetime.datetime(2017, 1, 31, 13, 2, 3) self.assertEqual(nice_time(dt, lang="eu", use_24hour=True), "hamahiruak zero bi") self.assertEqual(nice_time(dt, lang="eu", use_ampm=True), "arratsaldeko ordubata eta bi") 
self.assertEqual(nice_time(dt, lang="eu", speech=False), "1:02") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=True), "hamahiruak zero bi") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=False), "hamahiruak zero bi") dt = datetime.datetime(2017, 1, 31, 0, 2, 3) self.assertEqual(nice_time(dt, lang="eu"), "hamabiak eta bi") self.assertEqual(nice_time(dt, lang="eu", use_ampm=True), "gaueko hamabiak eta bi") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True), "zeroak zero bi") self.assertEqual(nice_time(dt, lang="eu", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="eu", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=True), "zeroak zero bi") self.assertEqual(nice_time(dt, lang="eu", use_24hour=True, use_ampm=False), "zeroak zero bi") dt = datetime.datetime(2017, 1, 31, 12, 15, 9) self.assertEqual(nice_time(dt, lang="eu-eu"), "hamabiak eta laurden") self.assertEqual(nice_time(dt, lang="eu-eu", use_ampm=True), "goizeko hamabiak eta laurden") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="eu-eu", use_24hour=True, use_ampm=True), "hamabiak hamabost") self.assertEqual(nice_time(dt, 
lang="eu-eu", use_24hour=True, use_ampm=False), "hamabiak hamabost") dt = datetime.datetime(2017, 1, 31, 19, 40, 49) self.assertEqual(nice_time(dt, lang="eu-eu"), "zortzirak hogei gutxi") self.assertEqual(nice_time(dt, lang="eu-eu", use_ampm=True), "arratsaldeko zortzirak hogei gutxi") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False), "7:40") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="eu-eu", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="eu-eu", use_24hour=True, use_ampm=True), "hemeretziak berrogei") self.assertEqual(nice_time(dt, lang="eu-eu", use_24hour=True, use_ampm=False), "hemeretziak berrogei") dt = datetime.datetime(2017, 1, 31, 1, 15, 00) self.assertEqual(nice_time(dt, lang="eu-eu", use_24hour=True), "batak hamabost") dt = datetime.datetime(2017, 1, 31, 1, 35, 00) self.assertEqual(nice_time(dt, lang="eu-eu"), "ordubiak hogeita bost gutxi") dt = datetime.datetime(2017, 1, 31, 1, 45, 00) self.assertEqual(nice_time(dt, lang="eu-eu"), "ordubiak laurden gutxi") dt = datetime.datetime(2017, 1, 31, 4, 50, 00) self.assertEqual(nice_time(dt, lang="eu-eu"), "bostak hamar gutxi") dt = datetime.datetime(2017, 1, 31, 5, 55, 00) self.assertEqual(nice_time(dt, lang="eu-eu"), "seirak bost gutxi") dt = datetime.datetime(2017, 1, 31, 5, 30, 00) self.assertEqual(nice_time(dt, lang="eu-eu", use_ampm=True), "gaueko bostak eta erdi") dt = datetime.datetime(2017, 1, 31, 23, 15, 9) self.assertEqual(nice_time(dt, lang="eu-eu", use_24hour=True, use_ampm=True), "hogeita hiruak hamabost") self.assertEqual(nice_time(dt, lang="eu-eu", use_24hour=False, use_ampm=True), "gaueko hamaikak eta laurden") class TestNiceRelativeTime(unittest.TestCase): def test_format_nice_relative_time(self): now = now_local() two_hours_from_now = now + datetime.timedelta(hours=2) 
self.assertEqual( nice_relative_time(when=two_hours_from_now, relative_to=now), "2 ordu" ) seconds_from_now = now + datetime.timedelta(seconds=47) self.assertEqual( nice_relative_time(when=seconds_from_now, relative_to=now), "47 segundo" ) days_from_now = now + datetime.timedelta(days=3) self.assertEqual( nice_relative_time(when=days_from_now, relative_to=now), "3 egun" ) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_fa.py000066400000000000000000000455321426211343400223700ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import json import unittest import datetime import ast import warnings import sys from pathlib import Path # TODO either write a getter for lingua_franca.internal._SUPPORTED_LANGUAGES, # or make it public somehow from lingua_franca import load_languages, unload_languages, set_default_lang, \ get_primary_lang_code, get_active_langs, get_supported_langs from lingua_franca.internal import UnsupportedLanguageError from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import nice_date from lingua_franca.format import nice_date_time from lingua_franca.format import nice_year from lingua_franca.format import nice_duration from lingua_franca.format import pronounce_number from lingua_franca.format import date_time_format from lingua_franca.format import join_list def setUpModule(): load_languages(get_supported_langs()) # TODO spin English tests off into another file, like other languages, so we # don't have to do this confusing thing in the "master" test_format.py set_default_lang('fa-ir') def tearDownModule(): unload_languages(get_active_langs()) NUMBERS_FIXTURE_EN = { 1.435634: '1.436', 2: '2', 5.0: '5', 0.027: '0.027', 0.5: 'یک دوم', 1.333: '1 و یک سوم', 2.666: '2 و 2 سوم', 0.25: 'یک چهارم', 1.25: '1 و یک چهارم', 0.75: '3 چهارم', 1.75: '1 و 3 چهارم', 3.4: '3 و 2 پنجم', 16.8333: '16 و 5 ششم', 12.5714: '12 و 4 هفتم', 9.625: '9 و 5 هشتم', 6.777: '6 و 7 نهم', 3.1: '3 و یک دهم', 2.272: '2 و 3 یازدهم', 5.583: '5 و 7 دوازدهم', 8.384: '8 و 5 سیزدهم', 0.071: 'یک چهاردهم', 6.466: '6 و 7 پونزدهم', 8.312: '8 و 5 شونزدهم', 2.176: '2 و 3 هیفدهم', 200.722: '200 و 13 هیجدهم', 7.421: '7 و 8 نوزدهم', 0.05: 'یک بیستم' } class TestNiceNumberFormat(unittest.TestCase): tmp_var = None def set_tmp_var(self, val): self.tmp_var = val def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_EN.items(): self.assertEqual(nice_number(number), number_str, 'should format {} as {} and not {}'.format( number, 
number_str, nice_number(number))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), '5 و یک دوم', 'should format 5.5 as 5 and a half not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, denominators=[1, 2]), '2.333', 'should format 2.333 as 2.333 not {}'.format( nice_number(2.333, denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, speech=False))) class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0), "صفر") self.assertEqual(pronounce_number(1), "یک") self.assertEqual(pronounce_number(10), "ده") self.assertEqual(pronounce_number(15), "پونزده") self.assertEqual(pronounce_number(20), "بیست") self.assertEqual(pronounce_number(27), "بیست و هفت") self.assertEqual(pronounce_number(30), "سی") self.assertEqual(pronounce_number(33), "سی و سه") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1), "منفی یک") self.assertEqual(pronounce_number(-10), "منفی ده") self.assertEqual(pronounce_number(-15), "منفی پونزده") self.assertEqual(pronounce_number(-20), "منفی بیست") self.assertEqual(pronounce_number(-27), "منفی بیست و هفت") def test_convert_decimals(self): self.assertEqual(pronounce_number(0.05), "پنج صدم") self.assertEqual(pronounce_number(-0.05), "منفی پنج صدم") self.assertEqual(pronounce_number(1.234), "یک و بیست و سه صدم") self.assertEqual(pronounce_number(21.234), "بیست و یک و بیست و سه صدم") self.assertEqual(pronounce_number(21.234, places=1), "بیست و یک و دو دهم") self.assertEqual(pronounce_number(21.234, places=0), "بیست و یک") self.assertEqual(pronounce_number(21.234, places=3), "بیست و یک و دویست و سی و چهار هزارم") self.assertEqual(pronounce_number(21.234, 
places=4), "بیست و یک و دویست و سی و چهار هزارم") self.assertEqual(pronounce_number(21.234, places=5), "بیست و یک و دویست و سی و چهار هزارم") self.assertEqual(pronounce_number(-1.234), "منفی یک و بیست و سه صدم") self.assertEqual(pronounce_number(-21.234), "منفی بیست و یک و بیست و سه صدم") self.assertEqual(pronounce_number(-21.234, places=1), "منفی بیست و یک و دو دهم") def test_convert_hundreds(self): self.assertEqual(pronounce_number(100), "صد") self.assertEqual(pronounce_number(666), "ششصد و شصت و شش") self.assertEqual(pronounce_number(1456), "هزار و چهارصد و پنجاه و شش") self.assertEqual(pronounce_number(103254654), "صد و سه میلیون و " "دویست و پنجاه و چهار " "هزار و ششصد و پنجاه و چهار") self.assertEqual(pronounce_number(1512457), "یک میلیون و پانصد و دوازده هزار" " و چهارصد و پنجاه و هفت") self.assertEqual(pronounce_number(209996), "دویست و نه هزار و نهصد و نود و شش") def test_convert_scientific_notation(self): self.assertEqual(pronounce_number(0, scientific=True), "صفر") self.assertEqual(pronounce_number(33, scientific=True), "سه و سه دهم ضرب در ده به توان یک") self.assertEqual(pronounce_number(299792458, scientific=True), "دو و نود و نه صدم ضرب در ده به توان هشت") self.assertEqual(pronounce_number(299792448, places=6, scientific=True), "دو و نهصد و نود و هفت هزار و نهصد و بیست و چهار میلیونیم ضرب در ده به توان هشت") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True), "یک و ششصد و هفتاد و دو هزارم ضرب در ده به توان منفی بیست و هفت") def test_ordinals(self): self.assertEqual(pronounce_number(1, ordinals=True), "یکم") self.assertEqual(pronounce_number(10, ordinals=True), "دهم") self.assertEqual(pronounce_number(15, ordinals=True), "پونزدهم") self.assertEqual(pronounce_number(20, ordinals=True), "بیستم") self.assertEqual(pronounce_number(27, ordinals=True), "بیست و هفتم") self.assertEqual(pronounce_number(30, ordinals=True), "سیم") self.assertEqual(pronounce_number(33, ordinals=True), "سی و سوم") self.assertEqual(pronounce_number(100, 
ordinals=True), "صدم") self.assertEqual(pronounce_number(1000, ordinals=True), "هزارم") self.assertEqual(pronounce_number(10000, ordinals=True), "ده هزارم") self.assertEqual(pronounce_number(18691, ordinals=True), "هیجده هزار و ششصد و نود و یکم") self.assertEqual(pronounce_number(1567, ordinals=True), "هزار و پانصد و شصت و هفتم") self.assertEqual(pronounce_number(18e6, ordinals=True), "هیجده میلیونم") self.assertEqual(pronounce_number(18e9, ordinals=True), "هیجده میلیاردم") def test_variant(self): self.assertEqual(pronounce_number(18691, ordinals=True, variant="formal"), "هجده هزار و ششصد و نود و یکم") self.assertEqual(pronounce_number(15, variant='conversational'), "پونزده") self.assertEqual(pronounce_number(15, variant='formal'), "پانزده") self.assertEqual(nice_number(2.176, variant='formal'), "2 و 3 هفدهم") dt = datetime.datetime(2017, 1, 31, 16, 22, 3) self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True, variant='formal'), "شانزده و بیست و دو دقیقه") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True, variant='conversational'), "شونزده و بیست و دو دقیقه") # def nice_time(dt, lang="en-us", speech=True, use_24hour=False, # use_ampm=False): class TestNiceDateFormat(unittest.TestCase): @classmethod def setUpClass(cls): # Read date_time_test.json files for test data cls.test_config = {} p = Path(date_time_format.config_path) for sub_dir in [x for x in p.iterdir() if x.is_dir()]: if (sub_dir / 'date_time_test.json').exists(): print("Getting test for " + str(sub_dir / 'date_time_test.json')) with (sub_dir / 'date_time_test.json').open() as f: cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3) # Verify defaults haven't changed self.assertEqual(nice_time(dt), nice_time(dt, "fa-ir", True, False, False)) self.assertEqual(nice_time(dt), "یک و بیست و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), "یک و بیست و دو دقیقه بعد از ظهر") 
self.assertEqual(nice_time(dt, speech=False), "1:22") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "سیزده و بیست و دو دقیقه") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "سیزده و بیست و دو دقیقه") dt = datetime.datetime(2017, 1, 31, 13, 0, 3) self.assertEqual(nice_time(dt), "یک") self.assertEqual(nice_time(dt, use_ampm=True), "یک بعد از ظهر") self.assertEqual(nice_time(dt, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "سیزده") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "سیزده") dt = datetime.datetime(2017, 1, 31, 13, 2, 3) self.assertEqual(nice_time(dt), "یک و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), "یک و دو دقیقه بعد از ظهر") self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "سیزده و دو دقیقه") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "سیزده و دو دقیقه") dt = datetime.datetime(2017, 1, 31, 0, 2, 3) self.assertEqual(nice_time(dt), "دوازده و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), "دوازده و دو دقیقه قبل از ظهر") self.assertEqual(nice_time(dt, speech=False), "12:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "12:02 AM") 
self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "صفر و دو دقیقه") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "صفر و دو دقیقه") dt = datetime.datetime(2018, 2, 8, 1, 2, 33) self.assertEqual(nice_time(dt), "یک و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), "یک و دو دقیقه قبل از ظهر") self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 AM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "01:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "01:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "یک و دو دقیقه") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "یک و دو دقیقه") dt = datetime.datetime(2017, 1, 31, 12, 15, 9) self.assertEqual(nice_time(dt), "دوازده و ربع") self.assertEqual(nice_time(dt, use_ampm=True), "دوازده و ربع بعد از ظهر") dt = datetime.datetime(2017, 1, 31, 5, 30, 00) self.assertEqual(nice_time(dt, use_ampm=True), "پنج و نیم قبل از ظهر") dt = datetime.datetime(2017, 1, 31, 1, 45, 00) self.assertEqual(nice_time(dt), "یه ربع به دو") # TODO: failed because of و #def test_nice_duration(self): # self.assertEqual(nice_duration(1), "یک ثانیه") # self.assertEqual(nice_duration(3), "سه ثانیه") # self.assertEqual(nice_duration(1, speech=False), "0:01") # self.assertEqual(nice_duration(61), "یک دقیقه و یک ثانیه") # self.assertEqual(nice_duration(61, speech=False), "1:01") # self.assertEqual(nice_duration(5000), # "یک ساعت و بیست و سه دقیقه و بیست ثانیه") # self.assertEqual(nice_duration(5000, speech=False), "1:23:20") # self.assertEqual(nice_duration(50000), # "سیزده ساعت و پنجاه و سه دقیقه و بیست ثانیه") # self.assertEqual(nice_duration(50000, speech=False), "13:53:20") # 
self.assertEqual(nice_duration(500000), # "پنج روز و هیجده ساعت و پنجاه و سه دقیقه و بیست ثانیه") # nopep8 # self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") # self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), # speech=False), # "5d 18:53:20") def test_join(self): self.assertEqual(join_list(None, "and"), "") self.assertEqual(join_list([], "and"), "") self.assertEqual(join_list(["الف"], "و"), "الف") self.assertEqual(join_list(["الف", "ب"], "و"), "الف و ب") self.assertEqual(join_list(["الف", "ب"], "یا"), "الف یا ب") self.assertEqual(join_list(["الف", "ب", "ج"], "و"), "الف, ب و ج") self.assertEqual(join_list(["الف", "ب", "ج"], "یا"), "الف, ب یا ج") self.assertEqual(join_list(["الف", "ب", "ج"], "یا", ";"), "الف; ب یا ج") self.assertEqual(join_list(["الف", "ب", "ج", "دال"], "یا"), "الف, ب, ج یا دال") self.assertEqual(join_list([1, "ب", 3, "دال"], "یا"), "1, ب, 3 یا دال") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_fr.py000066400000000000000000000421711426211343400224050ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest import datetime from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone def setUpModule(): load_language('fr-fr') set_default_lang('fr') def tearDownModule(): unload_language('fr') NUMBERS_FIXTURE_FR = { 1.435634: '1,436', 2: '2', 5.0: '5', 1234567890: '1234567890', 12345.67890: '12345,679', 0.027: '0,027', 0.5: 'un demi', 1.333: '1 et 1 tiers', 2.666: '2 et 2 tiers', 0.25: 'un quart', 1.25: '1 et 1 quart', 0.75: '3 quarts', 1.75: '1 et 3 quarts', 3.4: '3 et 2 cinquièmes', 16.8333: '16 et 5 sixièmes', 12.5714: '12 et 4 septièmes', 9.625: '9 et 5 huitièmes', 6.777: '6 et 7 neuvièmes', 3.1: '3 et 1 dixième', 2.272: '2 et 3 onzièmes', 5.583: '5 et 7 douzièmes', 8.384: '8 et 5 treizièmes', 0.071: 'un quatorzième', 6.466: '6 et 7 quinzièmes', 8.312: '8 et 5 seizièmes', 2.176: '2 et 3 dix-septièmes', 200.722: '200 et 13 dix-huitièmes', 7.421: '7 et 8 dix-neuvièmes', 0.05: 'un vingtième' } class TestNiceNumberFormat_fr(unittest.TestCase): def test_convert_float_to_nice_number_fr(self): for number, number_str in NUMBERS_FIXTURE_FR.items(): self.assertEqual(nice_number(number, lang="fr-fr"), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number( number, lang="fr-fr"))) def test_specify_denominator_fr(self): self.assertEqual(nice_number(5.5, lang="fr-fr", denominators=[1, 2, 3]), '5 et demi', 'should format 5.5 as 5 et demi not {}'.format( nice_number(5.5, lang="fr-fr", denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, lang="fr-fr", denominators=[1, 2]), '2,333', 'should format 2.333 as 2,333 not {}'.format( nice_number(2.333, lang="fr-fr", denominators=[1, 2]))) def test_no_speech_fr(self): self.assertEqual(nice_number(6.777, lang="fr-fr", speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( 
nice_number(6.777, lang="fr-fr", speech=False))) self.assertEqual(nice_number(6.0, lang="fr-fr", speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, lang="fr-fr", speech=False))) self.assertEqual(nice_number(1234567890, lang="fr-fr", speech=False), '1 234 567 890', 'should format 1234567890 as' '1 234 567 890 not {}'.format( nice_number(1234567890, lang="fr-fr", speech=False))) self.assertEqual(nice_number(12345.6789, lang="fr-fr", speech=False), '12 345,679', 'should format 12345.6789 as' '12 345,679 not {}'.format( nice_number(12345.6789, lang="fr-fr", speech=False))) # def pronounce_number(number, lang="en-us", places=2): class TestPronounceNumber_fr(unittest.TestCase): def test_convert_int_fr(self): self.assertEqual(pronounce_number(0, lang="fr-fr"), "zéro") self.assertEqual(pronounce_number(1, lang="fr-fr"), "un") self.assertEqual(pronounce_number(10, lang="fr-fr"), "dix") self.assertEqual(pronounce_number(15, lang="fr-fr"), "quinze") self.assertEqual(pronounce_number(20, lang="fr-fr"), "vingt") self.assertEqual(pronounce_number(27, lang="fr-fr"), "vingt-sept") self.assertEqual(pronounce_number(30, lang="fr-fr"), "trente") self.assertEqual(pronounce_number(33, lang="fr-fr"), "trente-trois") self.assertEqual(pronounce_number(71, lang="fr-fr"), "soixante-et-onze") self.assertEqual(pronounce_number(80, lang="fr-fr"), "quatre-vingts") self.assertEqual(pronounce_number(74, lang="fr-fr"), "soixante-quatorze") self.assertEqual(pronounce_number(79, lang="fr-fr"), "soixante-dix-neuf") self.assertEqual(pronounce_number(91, lang="fr-fr"), "quatre-vingt-onze") self.assertEqual(pronounce_number(97, lang="fr-fr"), "quatre-vingt-dix-sept") self.assertEqual(pronounce_number(300, lang="fr-fr"), "300") def test_convert_negative_int_fr(self): self.assertEqual(pronounce_number(-1, lang="fr-fr"), "moins un") self.assertEqual(pronounce_number(-10, lang="fr-fr"), "moins dix") self.assertEqual(pronounce_number(-15, lang="fr-fr"), "moins quinze") 
self.assertEqual(pronounce_number(-20, lang="fr-fr"), "moins vingt") self.assertEqual(pronounce_number(-27, lang="fr-fr"), "moins vingt-sept") self.assertEqual(pronounce_number(-30, lang="fr-fr"), "moins trente") self.assertEqual(pronounce_number(-33, lang="fr-fr"), "moins trente-trois") def test_convert_decimals_fr(self): self.assertEqual(pronounce_number(0.05, lang="fr-fr"), "zéro virgule zéro cinq") self.assertEqual(pronounce_number(-0.05, lang="fr-fr"), "moins zéro virgule zéro cinq") self.assertEqual(pronounce_number(1.234, lang="fr-fr"), "un virgule deux trois") self.assertEqual(pronounce_number(21.234, lang="fr-fr"), "vingt-et-un virgule deux trois") self.assertEqual(pronounce_number(21.234, lang="fr-fr", places=1), "vingt-et-un virgule deux") self.assertEqual(pronounce_number(21.234, lang="fr-fr", places=0), "vingt-et-un") self.assertEqual(pronounce_number(21.234, lang="fr-fr", places=3), "vingt-et-un virgule deux trois quatre") self.assertEqual(pronounce_number(21.234, lang="fr-fr", places=4), "vingt-et-un virgule deux trois quatre") self.assertEqual(pronounce_number(21.234, lang="fr-fr", places=5), "vingt-et-un virgule deux trois quatre") self.assertEqual(pronounce_number(-1.234, lang="fr-fr"), "moins un virgule deux trois") self.assertEqual(pronounce_number(-21.234, lang="fr-fr"), "moins vingt-et-un virgule deux trois") self.assertEqual(pronounce_number(-21.234, lang="fr-fr", places=1), "moins vingt-et-un virgule deux") self.assertEqual(pronounce_number(-21.234, lang="fr-fr", places=0), "moins vingt-et-un") self.assertEqual(pronounce_number(-21.234, lang="fr-fr", places=3), "moins vingt-et-un virgule deux trois quatre") self.assertEqual(pronounce_number(-21.234, lang="fr-fr", places=4), "moins vingt-et-un virgule deux trois quatre") self.assertEqual(pronounce_number(-21.234, lang="fr-fr", places=5), "moins vingt-et-un virgule deux trois quatre") # def nice_time(dt, lang="en-us", speech=True, use_24hour=False, # use_ampm=False): class 
TestNiceDateFormat_fr(unittest.TestCase): def test_convert_times_fr(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "une heure vingt-deux") self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "une heure vingt-deux de l'après-midi") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=True), "treize heures vingt-deux") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=False), "treize heures vingt-deux") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "une heure") self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "une heure de l'après-midi") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=True), "treize heures") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=False), "treize heures") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "une heure deux") self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "une heure deux de l'après-midi") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False), "1:02") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, 
use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=True), "treize heures deux") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=False), "treize heures deux") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "minuit deux") self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "minuit deux") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=True), "minuit deux") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=False), "minuit deux") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "midi et quart") self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "midi et quart") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=True), "midi quinze") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=False), "midi quinze") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) 
self.assertEqual(nice_time(dt, lang="fr-fr"), "huit heures moins vingt") self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "huit heures moins vingt du soir") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False), "7:40") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="fr-fr", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=True), "dix-neuf heures quarante") self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True, use_ampm=False), "dix-neuf heures quarante") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr", use_24hour=True), "une heure quinze") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "deux heures moins vingt-cinq") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "deux heures moins le quart") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "cinq heures moins dix") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr"), "six heures moins cinq") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="fr-fr", use_ampm=True), "cinq heures et demi du matin") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_hu.py000066400000000000000000000442671426211343400224220ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest import datetime from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.lang.format_hu import pronounce_ordinal_hu from lingua_franca.time import default_timezone def setUpModule(): load_language('hu-hu') set_default_lang('hu') def tearDownModule(): unload_language('hu') # fractions are not capitalized for now NUMBERS_FIXTURE_HU = { 1.435634: '1,436', 2: '2', 5.0: '5', 1234567890: '1234567890', 12345.67890: '12345,679', 0.027: '0,027', 0.5: 'fél', 1.333: '1 egész egy harmad', 2.666: '2 egész 2 harmad', 0.25: 'egy negyed', 1.25: '1 egész egy negyed', 0.75: '3 negyed', 1.75: '1 egész 3 negyed', 3.4: '3 egész 2 ötöd', 16.8333: '16 egész 5 hatod', 12.5714: '12 egész 4 heted', 9.625: '9 egész 5 nyolcad', 6.777: '6 egész 7 kilenced', 3.1: '3 egész egy tized', 2.272: '2 egész 3 tizenegyed', 5.583: '5 egész 7 tizenketted', 8.384: '8 egész 5 tizenharmad', 0.071: 'egy tizennegyed', 6.466: '6 egész 7 tizenötöd', 8.312: '8 egész 5 tizenhatod', 2.176: '2 egész 3 tizenheted', 200.722: '200 egész 13 tizennyolcad', 7.421: '7 egész 8 tizenkilenced', 0.05: 'egy huszad' } class TestNiceNumberFormat(unittest.TestCase): def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_HU.items(): self.assertEqual(nice_number(number, lang="hu-hu"), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number, lang="hu-hu"))) def 
test_specify_denominator(self): self.assertEqual(nice_number(5.5, lang="hu-hu", denominators=[1, 2, 3]), '5 és fél', 'should format 5.5 as 5 és fél not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, lang="hu-hu", denominators=[1, 2]), '2,333', 'should format 2,333 as 2,333 not {}'.format( nice_number(2.333, lang="hu-hu", denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, lang="hu-hu", speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, lang="hu-hu", speech=False))) class TestPronounceOrdinal(unittest.TestCase): def test_convert_int_hu(self): self.assertEqual(pronounce_ordinal_hu(0), "nulladik") self.assertEqual(pronounce_ordinal_hu(1), "első") self.assertEqual(pronounce_ordinal_hu(3), "harmadik") self.assertEqual(pronounce_ordinal_hu(5), "ötödik") self.assertEqual(pronounce_ordinal_hu(15), "tizenötödik") self.assertEqual(pronounce_ordinal_hu(25), "huszonötödik") self.assertEqual(pronounce_ordinal_hu(1000), "ezredik") self.assertEqual(pronounce_ordinal_hu(60), "hatvanadik") self.assertEqual(pronounce_ordinal_hu(1266), "ezerkétszázhatvanhatodik") self.assertEqual(pronounce_ordinal_hu(101), "százegyedik") self.assertEqual(pronounce_ordinal_hu(123456), "százhuszonháromezer-négyszázötvenhatodik") self.assertEqual(pronounce_ordinal_hu(8000000), "nyolcmilliomodik") # def pronounce_number(number, lang="hu-hu", places=2): class TestPronounceNumber(unittest.TestCase): def test_convert_int_hu(self): self.assertEqual(pronounce_number(123456789123456789, lang="hu-hu"), "százhuszonhárombilliárd-" "négyszázötvenhatbillió-" "hétszáznyolcvankilencmilliárd-" "százhuszonhárommillió-" "négyszázötvenhatezer-" "hétszáznyolcvankilenc") self.assertEqual(pronounce_number(1, lang="hu-hu"), "egy") self.assertEqual(pronounce_number(10, lang="hu-hu"), 
"tíz") self.assertEqual(pronounce_number(15, lang="hu-hu"), "tizenöt") self.assertEqual(pronounce_number(20, lang="hu-hu"), "húsz") self.assertEqual(pronounce_number(27, lang="hu-hu"), "huszonhét") self.assertEqual(pronounce_number(30, lang="hu-hu"), "harminc") self.assertEqual(pronounce_number(33, lang="hu-hu"), "harminchárom") self.assertEqual(pronounce_number(71, lang="hu-hu"), "hetvenegy") self.assertEqual(pronounce_number(80, lang="hu-hu"), "nyolcvan") self.assertEqual(pronounce_number(74, lang="hu-hu"), "hetvennégy") self.assertEqual(pronounce_number(79, lang="hu-hu"), "hetvenkilenc") self.assertEqual(pronounce_number(91, lang="hu-hu"), "kilencvenegy") self.assertEqual(pronounce_number(97, lang="hu-hu"), "kilencvenhét") self.assertEqual(pronounce_number(300, lang="hu-hu"), "háromszáz") self.assertEqual(pronounce_number(1905, lang="hu-hu"), "ezerkilencszázöt") self.assertEqual(pronounce_number(2001, lang="hu-hu"), "kétezer-egy") def test_convert_negative_int_hu(self): self.assertEqual(pronounce_number(-1, lang="hu-hu"), "mínusz egy") self.assertEqual(pronounce_number(-10, lang="hu-hu"), "mínusz tíz") self.assertEqual(pronounce_number(-15, lang="hu-hu"), "mínusz tizenöt") self.assertEqual(pronounce_number(-20, lang="hu-hu"), "mínusz húsz") self.assertEqual(pronounce_number(-27, lang="hu-hu"), "mínusz huszonhét") self.assertEqual(pronounce_number(-30, lang="hu-hu"), "mínusz harminc") self.assertEqual(pronounce_number(-33, lang="hu-hu"), "mínusz harminchárom") def test_convert_decimals_hu(self): self.assertEqual(pronounce_number(1.234, lang="hu-hu"), "egy egész huszonhárom század") self.assertEqual(pronounce_number(21.234, lang="hu-hu"), "huszonegy egész huszonhárom század") self.assertEqual(pronounce_number(21.234, lang="hu-hu", places=1), "huszonegy egész két tized") self.assertEqual(pronounce_number(21.234, lang="hu-hu", places=0), "huszonegy") self.assertEqual(pronounce_number(21.234, lang="hu-hu", places=3), "huszonegy egész kétszázharmincnégy ezred") 
self.assertEqual(pronounce_number(21.234, lang="hu-hu", places=4), "huszonegy egész kétezer-háromszáznegyven tízezred") self.assertEqual(pronounce_number(21.234, lang="hu-hu", places=5), "huszonegy egész huszonháromezer-négyszáz százezred") self.assertEqual(pronounce_number(-1.234, lang="hu-hu"), "mínusz egy egész huszonhárom század") self.assertEqual(pronounce_number(-21.234, lang="hu-hu"), "mínusz huszonegy egész huszonhárom század") self.assertEqual(pronounce_number(-21.234, lang="hu-hu", places=1), "mínusz huszonegy egész két tized") self.assertEqual(pronounce_number(-21.234, lang="hu-hu", places=0), "mínusz huszonegy") self.assertEqual(pronounce_number(-21.234, lang="hu-hu", places=3), "mínusz huszonegy egész kétszázharmincnégy ezred") self.assertEqual(pronounce_number(-21.234, lang="hu-hu", places=4), "mínusz huszonegy egész " "kétezer-háromszáznegyven tízezred") self.assertEqual(pronounce_number(-21.234, lang="hu-hu", places=5), "mínusz huszonegy egész " "huszonháromezer-négyszáz százezred") # def nice_time(dt, lang="hu-hu", speech=True, use_24hour=False, # use_ampm=False): class TestNiceDateFormat_hu(unittest.TestCase): def test_convert_times_hu(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "egy óra huszonkettő") self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "délután egy óra huszonkettő") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=True), "tizenhárom óra huszonkettő") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=False), "tizenhárom óra huszonkettő") dt = 
datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "egy óra") self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "délután egy óra") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=True), "tizenhárom óra") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=False), "tizenhárom óra") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "egy óra kettő") self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "délután egy óra kettő") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False), "1:02") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=True), "tizenhárom óra kettő") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=False), "tizenhárom óra kettő") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "tizenkét óra kettő") self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "éjjel tizenkét óra kettő") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True), "00:02") 
self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=True), "nulla óra kettő") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=False), "nulla óra kettő") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "tizenkét óra tizenöt") self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "délután tizenkét óra tizenöt") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=True), "tizenkét óra tizenöt") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=False), "tizenkét óra tizenöt") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "hét óra negyven") self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "este hét óra negyven") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False), "7:40") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="hu-hu", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=True), "tizenkilenc óra negyven") self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True, use_ampm=False), "tizenkilenc óra negyven") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu", use_24hour=True), "egy 
óra tizenöt") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "egy óra harmincöt") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "egy óra negyvenöt") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "négy óra ötven") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu"), "öt óra ötvenöt") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="hu-hu", use_ampm=True), "reggel öt óra harminc") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_it.py000066400000000000000000000455771426211343400224270ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest import datetime import sys from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone def setUpModule(): load_language('it-it') set_default_lang('it') def tearDownModule(): unload_language('it') NUMBERS_FIXTURE_IT = { 1.435634: '1.436', 2: '2', 5.0: '5', 0.027: '0.027', 0.5: 'un mezzo', 1.333: '1 e un terzo', 2.666: '2 e 2 terzi', 0.25: 'un quarto', 1.25: '1 e un quarto', 0.75: '3 quarti', 1.75: '1 e 3 quarti', 3.4: '3 e 2 quinti', 16.8333: '16 e 5 sesti', 12.5714: '12 e 4 settimi', 9.625: '9 e 5 ottavi', 6.777: '6 e 7 noni', 3.1: '3 e un decimo', 2.272: '2 e 3 undicesimi', 5.583: '5 e 7 dodicesimi', 8.384: '8 e 5 tredicesimi', 0.071: 'un quattordicesimo', 6.466: '6 e 7 quindicesimi', 8.312: '8 e 5 sedicesimi', 2.176: '2 e 3 diciassettesimi', 200.722: '200 e 13 diciottesimi', 7.421: '7 e 8 diciannovesimi', 0.05: 'un ventesimo' } class TestNiceNumberFormat(unittest.TestCase): def test_convert_float_to_nice_number_it(self): for number, number_str in NUMBERS_FIXTURE_IT.items(): self.assertEqual(nice_number(number, lang='it'), number_str, 'dovrebbe formattare {} come {} e none {}'.format( number, number_str, nice_number( number, lang="it"))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, denominators=[1, 2, 3], lang="it"), '5 e un mezzo', 'dovrebbe dare 5.5 come 5 e un mezzo non {}'.format( nice_number(5.5, denominators=[1, 2, 3], lang="it"))) self.assertEqual(nice_number(2.333, denominators=[1, 2], lang="it"), '2.333', 'dovrebbe dare 2.333 come 2.333 non {}'.format( nice_number(2.333, denominators=[1, 2], lang="it"))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False, lang="it"), '6 7/9', 'dovrebbe formattare 6.777 come 6 7/9 non {}'.format( nice_number(6.777, speech=False))) 
self.assertEqual(nice_number(6.0, speech=False, lang="it"), '6', 'dovrebbe formattare 6.0 come 6 non {}'.format( nice_number(6.0, speech=False))) # def pronounce_number(number, lang="it-it", places=2): class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0, lang="it"), "zero") self.assertEqual(pronounce_number(1, lang="it"), "uno") self.assertEqual(pronounce_number(10, lang="it"), "dieci") self.assertEqual(pronounce_number(15, lang="it"), "quindici") self.assertEqual(pronounce_number(21, lang="it"), "ventuno") self.assertEqual(pronounce_number(27, lang="it"), "ventisette") self.assertEqual(pronounce_number(30, lang="it"), "trenta") self.assertEqual(pronounce_number(83, lang="it"), "ottantatre") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1, lang="it"), "meno uno") self.assertEqual(pronounce_number(-10, lang="it"), "meno dieci") self.assertEqual(pronounce_number(-15, lang="it"), "meno quindici") self.assertEqual(pronounce_number(-21, lang="it"), "meno ventuno") self.assertEqual(pronounce_number(-27, lang="it"), "meno ventisette") self.assertEqual(pronounce_number(-30, lang="it"), "meno trenta") self.assertEqual(pronounce_number(-83, lang="it"), "meno ottantatre") def test_convert_decimals(self): self.assertEqual(pronounce_number( 0.05, lang="it"), "zero virgola zero cinque") self.assertEqual(pronounce_number( -0.05, lang="it"), "meno zero virgola zero cinque") self.assertEqual(pronounce_number(1.234, lang="it"), "uno virgola due tre") self.assertEqual(pronounce_number(21.234, lang="it"), "ventuno virgola due tre") self.assertEqual(pronounce_number(21.234, lang="it", places=1), "ventuno virgola due") self.assertEqual(pronounce_number(21.234, lang="it", places=0), "ventuno") self.assertEqual(pronounce_number(21.234, lang="it", places=3), "ventuno virgola due tre quattro") self.assertEqual(pronounce_number(21.234, lang="it", places=4), "ventuno virgola due tre quattro") 
self.assertEqual(pronounce_number(21.234, lang="it", places=5), "ventuno virgola due tre quattro") self.assertEqual(pronounce_number(-21.234, lang="it"), "meno ventuno virgola due tre") self.assertEqual(pronounce_number(-21.234, lang="it", places=1), "meno ventuno virgola due") self.assertEqual(pronounce_number(-21.234, lang="it", places=0), "meno ventuno") self.assertEqual(pronounce_number(-21.234, lang="it", places=3), "meno ventuno virgola due tre quattro") self.assertEqual(pronounce_number(-21.234, lang="it", places=4), "meno ventuno virgola due tre quattro") self.assertEqual(pronounce_number(-21.234, lang="it", places=5), "meno ventuno virgola due tre quattro") # TODO these tests seem to use short scale numbers, whereas the localized # formatter is configured to use the long scale. We need an Italian speaker # to write new tests. @unittest.skip("rewrite tests to use long scale") def test_convert_hundreds(self): self.assertEqual(pronounce_number(100, lang="it"), "cento") self.assertEqual(pronounce_number(121, lang="it"), "cento ventuno") self.assertEqual(pronounce_number(121000, lang="it"), "cento ventunomila") self.assertEqual(pronounce_number(666, lang="it"), "seicento sessantasei") self.assertEqual(pronounce_number(1456, lang="it"), "mille, quattrocento cinquantasei") self.assertEqual(pronounce_number(103254654, lang="it"), "cento tremilioni, duecento " "cinquantaquattromila, seicento " "cinquantaquattro") self.assertEqual(pronounce_number(1512457, lang="it"), "un milione, cinquecento dodicimila, " "quattrocento cinquantasette") self.assertEqual(pronounce_number(209996, lang="it"), "duecento novemila, novecento novantasei") self.assertEqual(pronounce_number(95505896639631893, lang="it"), "novantacinquebiliardi, cinquecento cinquebilioni, " "ottocento novantaseimiliardi, " "seicento trentanovemilioni, seicento " "trentunomila, ottocento novantatre") self.assertEqual(pronounce_number(95505896639631893, short_scale=False, lang="it"), "novantacinquemila 
cinquecento cinque miliardi, " "ottocento novantaseimila seicento trentanove" " milioni, seicento trentunomila, ottocento" " novantatre") def test_convert_scientific_notation(self): """ Test cases for italian text to scientific_notatio """ self.assertEqual(pronounce_number(0, scientific=True, lang="it"), "zero") self.assertEqual(pronounce_number(33, scientific=True, lang="it"), "tre virgola tre per dieci elevato alla uno") self.assertEqual(pronounce_number(299792458, scientific=True, lang="it"), "due virgola nove nove per dieci elevato alla otto") self.assertEqual(pronounce_number(299792458, places=6, scientific=True, lang="it"), "due virgola nove nove sette nove due cinque " "per dieci elevato alla otto") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True, lang="it"), "uno virgola sei sette due per dieci elevato alla " "meno ventisette") self.assertEqual(pronounce_number(-33, scientific=True, lang="it"), "meno tre virgola tre per dieci elevato alla uno") self.assertEqual(pronounce_number(-299792458, scientific=True, lang="it"), "meno due virgola nove nove per dieci elevato" " alla otto") self.assertEqual(pronounce_number(-1.672e-27, places=3, scientific=True, lang="it"), "meno uno virgola sei sette due per dieci elevato" " alla meno ventisette") def test_large_numbers(self): self.assertEqual( pronounce_number(299792458, short_scale=True, lang="it"), "duecento novantanovemilioni, settecento " "novantaduemila, quattrocento cinquantotto") self.assertEqual( pronounce_number(299792458, short_scale=False, lang="it"), "duecento novantanove milioni, settecento " "novantaduemila, quattrocento cinquantotto") self.assertEqual( pronounce_number(100034000000299792458, short_scale=True, lang="it"), "centotrilioni, trentaquattrobiliardi, " "duecento novantanovemilioni, settecento " "novantaduemila, quattrocento cinquantotto") self.assertEqual( pronounce_number(100034000000299792458, short_scale=False, lang="it"), "cento bilioni, trentaquattromila miliardi, " 
"duecento novantanove milioni, settecento " "novantaduemila, quattrocento cinquantotto") self.assertEqual( pronounce_number(10000000000, short_scale=True, lang="it"), "diecimiliardi") self.assertEqual( pronounce_number(1000000000000, short_scale=True, lang="it"), "bilioni") self.assertEqual( pronounce_number(1000001, short_scale=True, lang="it"), "un milione, uno") self.assertEqual( pronounce_number(1000000000, short_scale=False, lang="it"), "un miliardo") self.assertEqual( pronounce_number(1000000, short_scale=False, lang="it"), "un milione") self.assertEqual( pronounce_number(1000, short_scale=False, lang="it"), "mille") self.assertEqual( pronounce_number(1000900, short_scale=False, lang="it"), "uno milioni, novecento") def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed self.assertEqual(nice_time(dt, lang="it-it"), nice_time(dt, "it-it", True, False, False)) self.assertEqual(nice_time(dt, lang="it"), "una e ventidue") self.assertEqual(nice_time(dt, lang="it", use_ampm=True), "una e ventidue del pomeriggio") self.assertEqual(nice_time(dt, lang="it", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="it", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=True), "tredici e ventidue") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=False), "tredici e ventidue") # Verifica fasce orarie use_ampm = True d_time = datetime.datetime(2017, 1, 31, 8, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_ampm=True), "otto e ventidue della mattina") d_time = datetime.datetime(2017, 1, 31, 20, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_ampm=True), "otto e 
ventidue della sera") d_time = datetime.datetime(2017, 1, 31, 23, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_ampm=True), "undici e ventidue della notte") d_time = datetime.datetime(2017, 1, 31, 00, 00, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_ampm=True), "mezzanotte") d_time = datetime.datetime(2017, 1, 31, 12, 00, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_ampm=True), "mezzogiorno") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="it"), "una in punto") self.assertEqual(nice_time(dt, lang="it", use_ampm=True), "una del pomeriggio") self.assertEqual(nice_time(dt, lang="it", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="it", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=True), "tredici e zerozero") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=False), "tredici e zerozero") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="it", use_24hour=True), "tredici e zero due") self.assertEqual(nice_time(dt, lang="it", use_ampm=True), "una e zero due del pomeriggio") self.assertEqual(nice_time(dt, lang="it", speech=False), "1:02") self.assertEqual(nice_time(dt, lang="it", speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=True), "tredici e zero due") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=False), "tredici e 
zero due") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="it"), "mezzanotte e zero due") self.assertEqual(nice_time(dt, lang="it", use_ampm=True), "mezzanotte e zero due") self.assertEqual(nice_time(dt, lang="it", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="it", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="it", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=True), "zerozero e zero due") self.assertEqual(nice_time(dt, lang="it", use_24hour=True, use_ampm=False), "zerozero e zero due") # casi particolari d_time = datetime.datetime(2017, 1, 31, 1, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_24hour=True, use_ampm=True), "una e zero due") d_time = datetime.datetime(2017, 1, 31, 2, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_24hour=True, use_ampm=False), "zero due e zero due") d_time = datetime.datetime(2017, 1, 31, 10, 15, 0, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_24hour=False, use_ampm=False), "dieci e un quarto") d_time = datetime.datetime(2017, 1, 31, 22, 45, 0, tzinfo=default_timezone()) self.assertEqual(nice_time(d_time, lang="it", use_24hour=False, use_ampm=False), "dieci e tre quarti") def test_infinity(self): self.assertEqual(pronounce_number(sys.float_info.max * 2, lang="it"), "infinito") self.assertEqual(pronounce_number(float("inf"), lang="it"), "infinito") self.assertEqual(pronounce_number(float("-inf"), lang="it"), "meno infinito") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_nl.py000066400000000000000000000431021426211343400224020ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest import datetime from lingua_franca import load_language, unload_language from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.lang.format_nl import nice_response_nl from lingua_franca.lang.format_nl import pronounce_ordinal_nl from lingua_franca.time import default_timezone def setUpModule(): load_language('nl') def tearDownModule(): unload_language('nl') # fractions are not capitalized for now NUMBERS_FIXTURE_NL = { 1.435634: '1,436', 2: '2', 5.0: '5', 1234567890: '1234567890', 12345.67890: '12345,679', 0.027: '0,027', 0.5: 'één half', 1.333: '1 en één derde', 2.666: '2 en 2 derde', 0.25: 'één vierde', 1.25: '1 en één vierde', 0.75: '3 vierde', 1.75: '1 en 3 vierde', 3.4: '3 en 2 vijfde', 16.8333: '16 en 5 zesde', 12.5714: '12 en 4 zevende', 9.625: '9 en 5 achtste', 6.777: '6 en 7 negende', 3.1: '3 en één tiende', 2.272: '2 en 3 elfde', 5.583: '5 en 7 twaalfde', 8.384: '8 en 5 dertiende', 0.071: 'één veertiende', 6.466: '6 en 7 vijftiende', 8.312: '8 en 5 zestiende', 2.176: '2 en 3 zeventiende', 200.722: '200 en 13 achttiende', 7.421: '7 en 8 negentiende', 0.05: 'één twintigste' } class TestNiceResponse(unittest.TestCase): def test_replace_ordinal(self): self.assertEqual(nice_response_nl("dit is 31 mei"), "dit is éénendertig mei") self.assertEqual(nice_response_nl("het begint op 31 mei"), "het begint op 
éénendertig mei") self.assertEqual(nice_response_nl("31 mei"), "éénendertig mei") self.assertEqual(nice_response_nl("10 ^ 2"), "10 tot de macht 2") class TestNiceNumberFormat(unittest.TestCase): def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_NL.items(): self.assertEqual(nice_number(number, lang="nl-nl"), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number, lang="nl-nl"))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, lang="nl-nl", denominators=[1, 2, 3]), '5 en één half', 'should format 5.5 as 5 en één half not {}'.format( nice_number(5.5, lang="nl-nl", denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, lang="nl-nl", denominators=[1, 2]), '2,333', 'should format 2,333 as 2,333 not {}'.format( nice_number(2.333, lang="nl-nl", denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, lang="nl-nl", speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, lang="nl-nl", speech=False))) class TestPronounceOrdinal(unittest.TestCase): def test_convert_int_nl(self): self.assertEqual(pronounce_ordinal_nl(0), "nulste") self.assertEqual(pronounce_ordinal_nl(1), "eerste") self.assertEqual(pronounce_ordinal_nl(3), "derde") self.assertEqual(pronounce_ordinal_nl(5), "vijfde") self.assertEqual(pronounce_ordinal_nl(1000), "éénduizendste") self.assertEqual( pronounce_ordinal_nl(123456), "éénhonderddrieentwintigduizendvierhonderdzesenvijftigste" ) # def pronounce_number(number, lang="nl-nl", places=2): class TestPronounceNumber(unittest.TestCase): def test_convert_int_nl(self): self.assertEqual(pronounce_number(123456789123456789, lang="nl-nl"), "éénhonderddrieentwintig biljard " "vierhonderdzesenvijftig biljoen " "zevenhonderdnegenentachtig miljard " "éénhonderddrieentwintig miljoen " 
"vierhonderdzesenvijftigduizend" "zevenhonderdnegenentachtig") self.assertEqual(pronounce_number(1, lang="nl-nl"), "één") self.assertEqual(pronounce_number(10, lang="nl-nl"), "tien") self.assertEqual(pronounce_number(15, lang="nl-nl"), "vijftien") self.assertEqual(pronounce_number(20, lang="nl-nl"), "twintig") self.assertEqual(pronounce_number(27, lang="nl-nl"), "zevenentwintig") self.assertEqual(pronounce_number(30, lang="nl-nl"), "dertig") self.assertEqual(pronounce_number(33, lang="nl-nl"), "drieendertig") self.assertEqual(pronounce_number(71, lang="nl-nl"), "éénenzeventig") self.assertEqual(pronounce_number(80, lang="nl-nl"), "tachtig") self.assertEqual(pronounce_number(74, lang="nl-nl"), "vierenzeventig") self.assertEqual(pronounce_number(79, lang="nl-nl"), "negenenzeventig") self.assertEqual(pronounce_number(91, lang="nl-nl"), "éénennegentig") self.assertEqual(pronounce_number(97, lang="nl-nl"), "zevenennegentig") self.assertEqual(pronounce_number(300, lang="nl-nl"), "driehonderd") def test_convert_negative_int_nl(self): self.assertEqual(pronounce_number(-1, lang="nl-nl"), "min één") self.assertEqual(pronounce_number(-10, lang="nl-nl"), "min tien") self.assertEqual(pronounce_number(-15, lang="nl-nl"), "min vijftien") self.assertEqual(pronounce_number(-20, lang="nl-nl"), "min twintig") self.assertEqual(pronounce_number(-27, lang="nl-nl"), "min zevenentwintig") self.assertEqual(pronounce_number(-30, lang="nl-nl"), "min dertig") self.assertEqual(pronounce_number(-33, lang="nl-nl"), "min drieendertig") def test_convert_decimals_nl(self): self.assertEqual(pronounce_number(1.234, lang="nl-nl"), "één komma twee drie") self.assertEqual(pronounce_number(21.234, lang="nl-nl"), "éénentwintig komma twee drie") self.assertEqual(pronounce_number(21.234, lang="nl-nl", places=1), "éénentwintig komma twee") self.assertEqual(pronounce_number(21.234, lang="nl-nl", places=0), "éénentwintig") self.assertEqual(pronounce_number(21.234, lang="nl-nl", places=3), "éénentwintig komma 
twee drie vier") self.assertEqual(pronounce_number(21.234, lang="nl-nl", places=4), "éénentwintig komma twee drie vier nul") self.assertEqual(pronounce_number(21.234, lang="nl-nl", places=5), "éénentwintig komma twee drie vier nul nul") self.assertEqual(pronounce_number(-1.234, lang="nl-nl"), "min één komma twee drie") self.assertEqual(pronounce_number(-21.234, lang="nl-nl"), "min éénentwintig komma twee drie") self.assertEqual(pronounce_number(-21.234, lang="nl-nl", places=1), "min éénentwintig komma twee") self.assertEqual(pronounce_number(-21.234, lang="nl-nl", places=0), "min éénentwintig") self.assertEqual(pronounce_number(-21.234, lang="nl-nl", places=3), "min éénentwintig komma twee drie vier") self.assertEqual(pronounce_number(-21.234, lang="nl-nl", places=4), "min éénentwintig komma twee drie vier nul") self.assertEqual(pronounce_number(-21.234, lang="nl-nl", places=5), "min éénentwintig komma twee drie vier nul nul") # def nice_time(dt, lang="nl-nl", speech=True, use_24hour=False, # use_ampm=False): class TestNiceDateFormat_nl(unittest.TestCase): def test_convert_times_nl(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "tweeentwintig over één") self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "tweeentwintig over één 's middags") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=True), "dertien uur tweeentwintig") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=False), "dertien uur tweeentwintig") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) 
self.assertEqual(nice_time(dt, lang="nl-nl"), "één uur") self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "één uur 's middags") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=True), "dertien uur") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=False), "dertien uur") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "twee over één") self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "twee over één 's middags") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False), "1:02") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=True), "dertien uur twee") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=False), "dertien uur twee") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "twee over twaalf") self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "twee over twaalf 's nachts") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True, use_ampm=True), 
"00:02") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=True), "nul uur twee") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=False), "nul uur twee") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "kwart over twaalf") self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "kwart over twaalf 's middags") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=True), "twaalf uur vijftien") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=False), "twaalf uur vijftien") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "twintig voor acht") self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "twintig voor acht 's avonds") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False), "7:40") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="nl-nl", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=True), "negentien uur veertig") self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True, use_ampm=False), "negentien uur veertig") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl", use_24hour=True), "één uur vijftien") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) 
self.assertEqual(nice_time(dt, lang="nl-nl"), "vijfentwintig voor twee") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "kwart voor twee") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "tien voor vijf") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl"), "vijf voor zes") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="nl-nl", use_ampm=True), "half zes 's nachts") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_pl.py000077500000000000000000000423751426211343400224220ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest import datetime import sys from lingua_franca import get_default_lang, set_default_lang, \ load_language, unload_language from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import nice_duration from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone def setUpModule(): load_language("pl-pl") set_default_lang("pl") def tearDownModule(): unload_language("pl") NUMBERS_FIXTURE_PL = { 1.435634: '1.436', 2: '2', 5.0: '5', 0.027: '0.027', 0.5: '1 druga', 1.333: '1 i 1 trzecia', 2.666: '2 i 2 trzecie', 0.25: '1 czwarta', 1.25: '1 i 1 czwarta', 0.75: '3 czwarte', 1.75: '1 i 3 czwarte', 3.4: '3 i 2 piąte', 16.8333: '16 i 5 szóste', 12.5714: '12 i 4 siódme', 9.625: '9 i 5 ósme', 6.777: '6 i 7 dziewiąte', 3.1: '3 i 1 dziesiąta', 2.272: '2 i 3 jedenaste', 5.583: '5 i 7 dwunaste', 8.384: '8 i 5 trzynaste', 0.071: '1 czternasta', 6.466: '6 i 7 piętnaste', 8.312: '8 i 5 szesnaste', 2.176: '2 i 3 siedemnaste', 200.722: '200 i 13 osiemnaste', 7.421: '7 i 8 dziewiętnaste', 0.05: '1 dwudziesta' } class TestNiceNumberFormat(unittest.TestCase): def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_PL.items(): self.assertEqual(nice_number(number, lang='pl'), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), '5 i 1 druga', 'should format 5.5 as 5 and a half not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, denominators=[1, 2]), '2.333', 'should format 2.333 as 2.333 not {}'.format( nice_number(2.333, denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should 
format 6.0 as 6 not {}'.format( nice_number(6.0, speech=False))) class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0), "zero") self.assertEqual(pronounce_number(1), "jeden") self.assertEqual(pronounce_number(10), "dziesięć") self.assertEqual(pronounce_number(15), "piętnaście") self.assertEqual(pronounce_number(20), "dwadzieścia") self.assertEqual(pronounce_number(27), "dwadzieścia siedem") self.assertEqual(pronounce_number(30), "trzydzieści") self.assertEqual(pronounce_number(33), "trzydzieści trzy") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1), "minus jeden") self.assertEqual(pronounce_number(-10), "minus dziesięć") self.assertEqual(pronounce_number(-15), "minus piętnaście") self.assertEqual(pronounce_number(-20), "minus dwadzieścia") self.assertEqual(pronounce_number(-27), "minus dwadzieścia siedem") self.assertEqual(pronounce_number(-30), "minus trzydzieści") self.assertEqual(pronounce_number(-33), "minus trzydzieści trzy") def test_convert_decimals(self): self.assertEqual(pronounce_number(0.05), "zero przecinek zero pięć") self.assertEqual(pronounce_number(-0.05), "minus zero przecinek zero pięć") self.assertEqual(pronounce_number(1.234), "jeden przecinek dwa trzy") self.assertEqual(pronounce_number(21.234), "dwadzieścia jeden przecinek dwa trzy") self.assertEqual(pronounce_number(21.234, places=1), "dwadzieścia jeden przecinek dwa") self.assertEqual(pronounce_number(21.234, places=0), "dwadzieścia jeden") self.assertEqual(pronounce_number(21.234, places=3), "dwadzieścia jeden przecinek dwa trzy cztery") self.assertEqual(pronounce_number(21.234, places=4), "dwadzieścia jeden przecinek dwa trzy cztery") self.assertEqual(pronounce_number(21.234, places=5), "dwadzieścia jeden przecinek dwa trzy cztery") self.assertEqual(pronounce_number(-1.234), "minus jeden przecinek dwa trzy") self.assertEqual(pronounce_number(-21.234), "minus dwadzieścia jeden przecinek dwa trzy") 
self.assertEqual(pronounce_number(-21.234, places=1), "minus dwadzieścia jeden przecinek dwa") self.assertEqual(pronounce_number(-21.234, places=0), "minus dwadzieścia jeden") self.assertEqual(pronounce_number(-21.234, places=3), "minus dwadzieścia jeden przecinek dwa trzy cztery") self.assertEqual(pronounce_number(-21.234, places=4), "minus dwadzieścia jeden przecinek dwa trzy cztery") self.assertEqual(pronounce_number(-21.234, places=5), "minus dwadzieścia jeden przecinek dwa trzy cztery") def test_convert_hundreds(self): self.assertEqual(pronounce_number(100), "sto") self.assertEqual(pronounce_number(666), "sześćset sześćdziesiąt sześć") self.assertEqual(pronounce_number(1456), "jeden tysiąc, czterysta pięćdziesiąt sześć") self.assertEqual(pronounce_number(103254654), "sto trzy miliony, dwieście " "pięćdziesiąt cztery tysiące, sześćset " "pięćdziesiąt cztery") self.assertEqual(pronounce_number(1512457), "jeden milion, pięćset dwanaście tysięcy, czterysta " "pięćdziesiąt siedem") self.assertEqual(pronounce_number(209996), "dwieście dziewięć tysięcy, dziewięćset " "dziewięćdziesiąt sześć") def test_convert_scientific_notation(self): self.assertEqual(pronounce_number(0, scientific=True), "zero") self.assertEqual(pronounce_number(33, scientific=True), "trzy przecinek trzy razy dziesięć do potęgi jeden") self.assertEqual(pronounce_number(299792458, scientific=True), "dwa przecinek dziewięć dziewięć razy dziesięć do potęgi osiem") self.assertEqual(pronounce_number(299792458, places=6, scientific=True), "dwa przecinek dziewięć dziewięć siedem dziewięć dwa pięć razy " "dziesięć do potęgi osiem") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True), "jeden przecinek sześć siedem dwa razy dziesięć do potęgi " "minus dwadzieścia siedem") def test_auto_scientific_notation(self): self.assertEqual( pronounce_number(1.1e-150), "jeden przecinek jeden razy dziesięć do " "potęgi minus sto pięćdziesiąt") def test_large_numbers(self): self.assertEqual( 
pronounce_number(299792458), "dwieście dziewięćdziesiąt dziewięć milionów, siedemset " "dziewięćdziesiąt dwa tysiące, czterysta pięćdziesiąt osiem") self.assertEqual( pronounce_number(100034000000299792458), "sto trylionów, trzydzieści cztery biliardy, " "dwieście dziewięćdziesiąt dziewięć milionów, siedemset " "dziewięćdziesiąt dwa tysiące, czterysta pięćdziesiąt osiem") self.assertEqual( pronounce_number(10000000000), "dziesięć miliardów") self.assertEqual( pronounce_number(1000001), "jeden milion, jeden") self.assertEqual(pronounce_number(95505896639631893), "dziewięćdziesiąt pięć biliardów, pięćset pięć " "bilionów, osiemset dziewięćdziesiąt sześć miliardów, " "sześćset trzydzieści dziewięć milionów, sześćset " "trzydzieści jeden tysiące, osiemset dziewięćdziesiąt trzy") self.assertEqual(pronounce_number(10e80, places=1), "tredecyliard") self.assertEqual(pronounce_number(1.9874522571e80, places=9), "sto dziewięćdziesiąt osiem tredecylionów, " "siedemset czterdzieści pięć duodecyliardów, " "dwieście dwadzieścia pięć duodecylionów, " "siedemset dziewięć undecyliardów, " "dziewięćset dziewięćdziesiąt dziewięć undecylionów, " "dziewięćset osiemdziesiąt dziewięć decyliardów, " "siedemset trzydzieści decylionów, dziewięćset " "dziewiętnaście noniliardów, dziewięćset " "dziewięćdziesiąt dziewięć nonilionów, dziewięćset " "pięćdziesiąt pięć oktyliardów, czterysta " "dziewięćdziesiąt osiem oktylionów, dwieście " "czternaście septyliardy, osiemset " "czterdzieści pięć septylionów, czterysta " "dwadzieścia dziewięć sekstyliardów, czterysta " "czterdzieści cztery sekstyliony, trzysta " "trzydzieści sześć kwintyliardów, siedemset dwadzieścia " "cztery kwintyliony, pięćset sześćdziesiąt dziewięć " "kwadryliardów, trzysta siedemdziesiąt pięć " "kwadrylionów, dwieście trzydzieści dziewięć sekstilionów," " sześćset siedemdziesiąt trylionów, pięćset " "siedemdziesiąt cztery biliardy, siedemset " "trzydzieści dziewięć bilionów, siedemset czterdzieści " "osiem miliardów, czterysta 
siedemdziesiąt milionów, " "dziewięćset piętnaście tysięcy, siedemdziesiąt dwa") # infinity self.assertEqual( pronounce_number(sys.float_info.max * 2), "nieskończoność") self.assertEqual( pronounce_number(float("inf")), "nieskończoność") self.assertEqual( pronounce_number(float("-inf")), "minus nieskończoność") def test_ordinals(self): self.assertEqual(pronounce_number(1, ordinals=True), "pierwszy") self.assertEqual(pronounce_number(10, ordinals=True), "dziesiąty") self.assertEqual(pronounce_number(15, ordinals=True), "piętnasty") self.assertEqual(pronounce_number(20, ordinals=True), "dwudziesty") self.assertEqual(pronounce_number(27, ordinals=True), "dwudziesty siódmy") self.assertEqual(pronounce_number(30, ordinals=True), "trzydziesty") self.assertEqual(pronounce_number(33, ordinals=True), "trzydziesty trzeci") self.assertEqual(pronounce_number(100, ordinals=True), "setny") self.assertEqual(pronounce_number(1000, ordinals=True), "tysięczny") self.assertEqual(pronounce_number(10000, ordinals=True), "dziesięcio tysięczny") self.assertEqual(pronounce_number(18691, ordinals=True), "osiemnaście tysięcy, sześćset dziewięćdziesiąty pierwszy") self.assertEqual(pronounce_number(1567, ordinals=True), "jeden tysiąc, pięćset sześćdziesiąty siódmy") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True, ordinals=True), "jeden przecinek sześć siedem dwa razy dziesięć do " "minus dwudziestej siódmej potęgi") self.assertEqual(pronounce_number(18e6, ordinals=True), "osiemnasto milionowa") self.assertEqual(pronounce_number(18e12, ordinals=True), "osiemnasto bilionowa") self.assertEqual(pronounce_number(18e18, ordinals=True, short_scale=False), "osiemnasto " "trylionowa") class TestNiceDateFormat(unittest.TestCase): def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "trzynasta dwadzieścia dwa") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") 
self.assertEqual(nice_time(dt, use_24hour=True), "trzynasta dwadzieścia dwa") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "trzynasta zero zero") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True), "trzynasta zero zero") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "trzynasta dwa") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "trzynasta dwa") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dwa po północy") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "dwa po północy") dt = datetime.datetime(2018, 2, 8, 1, 2, 33, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "pierwsza dwa") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "01:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "pierwsza dwa") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dwunasta piętnaście") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "pierwsza czterdzieści pięć") def test_nice_duration(self): self.assertEqual(nice_duration(1), "jedna sekunda") self.assertEqual(nice_duration(3), "trzy sekundy") self.assertEqual(nice_duration(1, speech=False), "0:01") self.assertEqual(nice_duration(61), "jedna minuta jedna sekunda") self.assertEqual(nice_duration(61, speech=False), "1:01") self.assertEqual(nice_duration(5000), "jedna godzina dwadzieścia trzy minuty dwadzieścia sekund") self.assertEqual(nice_duration(5000, speech=False), "1:23:20") self.assertEqual(nice_duration(50000), "trzynaście godzin pięćdziesiąt trzy minuty 
dwadzieścia sekund") self.assertEqual(nice_duration(50000, speech=False), "13:53:20") self.assertEqual(nice_duration(500000), "pięć dni osiemnaście godzin pięćdziesiąt trzy minuty dwadzieścia sekund") # nopep8 self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), speech=False), "5d 18:53:20") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_pt.py000066400000000000000000000347731426211343400224320ustar00rootroot00000000000000# # Copyright 2019 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest import datetime from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone def setUpModule(): load_language('pt-pt') set_default_lang('pt') def tearDownModule(): unload_language('pt') NUMBERS_FIXTURE_PT = { 1.435634: '1,436', 2: '2', 5.0: '5', 0.027: '0,027', 0.5: 'um meio', 1.333: '1 e 1 terço', 2.666: '2 e 2 terços', 0.25: 'um quarto', 1.25: '1 e 1 quarto', 0.75: '3 quartos', 1.75: '1 e 3 quartos', 3.4: '3 e 2 quintos', 16.8333: '16 e 5 sextos', 12.5714: '12 e 4 sétimos', 9.625: '9 e 5 oitavos', 6.777: '6 e 7 nonos', 3.1: '3 e 1 décimo', 2.272: '2 e 3 onze avos', 5.583: '5 e 7 doze avos', 8.384: '8 e 5 treze avos', 0.071: 'catorze avos', 6.466: '6 e 7 quinze avos', 8.312: '8 e 5 dezasséis avos', 2.176: '2 e 3 dezassete avos', 200.722: '200 e 13 dezoito avos', 7.421: '7 e 8 dezanove avos', 0.05: 'um vigésimo' } class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0, lang="pt"), "zero") self.assertEqual(pronounce_number(1, lang="pt"), "um") self.assertEqual(pronounce_number(10, lang="pt"), "dez") self.assertEqual(pronounce_number(15, lang="pt"), "quinze") self.assertEqual(pronounce_number(21, lang="pt"), "vinte e um") self.assertEqual(pronounce_number(27, lang="pt"), "vinte e sete") self.assertEqual(pronounce_number(30, lang="pt"), "trinta") self.assertEqual(pronounce_number(19, lang="pt"), "dezanove") self.assertEqual(pronounce_number(88, lang="pt"), "oitenta e oito") self.assertEqual(pronounce_number(46, lang="pt"), "quarenta e seis") self.assertEqual(pronounce_number(99, lang="pt"), "noventa e nove") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1, lang="pt"), "menos um") self.assertEqual(pronounce_number(-10, lang="pt"), "menos dez") self.assertEqual(pronounce_number(-15, lang="pt"), "menos quinze") 
self.assertEqual(pronounce_number(-21, lang="pt"), "menos vinte e um") self.assertEqual(pronounce_number(-27, lang="pt"), "menos vinte e sete") self.assertEqual(pronounce_number(-30, lang="pt"), "menos trinta") self.assertEqual(pronounce_number(-35, lang="pt"), "menos trinta e cinco") self.assertEqual(pronounce_number(-83, lang="pt"), "menos oitenta e três") self.assertEqual(pronounce_number(-19, lang="pt"), "menos dezanove") self.assertEqual(pronounce_number(-88, lang="pt"), "menos oitenta e oito") self.assertEqual(pronounce_number(-46, lang="pt"), "menos quarenta e seis") self.assertEqual(pronounce_number(-99, lang="pt"), "menos noventa e nove") def test_convert_decimals(self): self.assertEqual(pronounce_number( 0.05, lang="pt"), "zero vírgula zero cinco") self.assertEqual(pronounce_number( -0.05, lang="pt"), "menos zero vírgula zero cinco") self.assertEqual(pronounce_number(1.234, lang="pt"), "um vírgula dois três") self.assertEqual(pronounce_number(21.234, lang="pt"), "vinte e um vírgula dois três") self.assertEqual(pronounce_number(21.234, lang="pt", places=1), "vinte e um vírgula dois") self.assertEqual(pronounce_number(21.234, lang="pt", places=0), "vinte e um") self.assertEqual(pronounce_number(21.234, lang="pt", places=3), "vinte e um vírgula dois três quatro") self.assertEqual(pronounce_number(21.234, lang="pt", places=4), "vinte e um vírgula dois três quatro") self.assertEqual(pronounce_number(20.234, lang="pt", places=5), "vinte vírgula dois três quatro") self.assertEqual(pronounce_number(-21.234, lang="pt"), "menos vinte e um vírgula dois três") self.assertEqual(pronounce_number(-21.234, lang="pt", places=1), "menos vinte e um vírgula dois") self.assertEqual(pronounce_number(-21.234, lang="pt", places=0), "menos vinte e um") self.assertEqual(pronounce_number(-21.234, lang="pt", places=3), "menos vinte e um vírgula dois três quatro") self.assertEqual(pronounce_number(-21.234, lang="pt", places=4), "menos vinte e um vírgula dois três quatro") 
self.assertEqual(pronounce_number(-21.234, lang="pt", places=5), "menos vinte e um vírgula dois três quatro") class TestNiceDateFormat(unittest.TestCase): def test_pm(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed self.assertEqual(nice_time(dt, lang="pt-pt"), nice_time(dt, "pt-pt", True, False, False)) self.assertEqual(nice_time(dt, lang="pt"), "uma e vinte e dois") self.assertEqual(nice_time(dt, lang="pt", use_ampm=True), "uma e vinte e dois da tarde") self.assertEqual(nice_time(dt, lang="pt", speech=False), "1:22") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=True), "treze e vinte e dois") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=False), "treze e vinte e dois") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt"), "uma em ponto") self.assertEqual(nice_time(dt, lang="pt", use_ampm=True), "uma da tarde") self.assertEqual(nice_time(dt, lang="pt", speech=False), "1:00") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=True), "treze") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt", use_24hour=True), "treze e dois") self.assertEqual(nice_time(dt, lang="pt", use_ampm=True), "uma e dois da tarde") self.assertEqual(nice_time(dt, lang="pt", speech=False), "1:02") self.assertEqual(nice_time(dt, 
lang="pt", speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=True), "treze e dois") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=False), "treze e dois") def test_midnight(self): dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt"), "meia noite e dois") self.assertEqual(nice_time(dt, lang="pt", use_ampm=True), "meia noite e dois") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True), "zero e dois") self.assertEqual(nice_time(dt, lang="pt", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="pt", speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=True), "zero e dois") self.assertEqual(nice_time(dt, lang="pt", use_24hour=True, use_ampm=False), "zero e dois") def test_midday(self): dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt"), "meio dia e um quarto") self.assertEqual(nice_time(dt, lang="pt-pt", use_ampm=True), "meio dia e um quarto") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="pt-pt", use_24hour=True, use_ampm=True), "doze e quinze") self.assertEqual(nice_time(dt, lang="pt-pt", use_24hour=True, 
use_ampm=False), "doze e quinze") def test_minutes_to_hour(self): # "twenty minutes to midnight" dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt"), "oito menos vinte") self.assertEqual(nice_time(dt, lang="pt-pt", use_ampm=True), "oito menos vinte da tarde") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False), "7:40") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False, use_ampm=True), "7:40 PM") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="pt-pt", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="pt-pt", use_24hour=True, use_ampm=True), "dezanove e quarenta") self.assertEqual(nice_time(dt, lang="pt-pt", use_24hour=True, use_ampm=False), "dezanove e quarenta") def test_minutes_past_hour(self): # "quarter past ten" dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt", use_24hour=True), "uma e quinze") self.assertEqual(nice_time(dt, lang="pt-pt"), "uma e um quarto") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt"), "duas menos vinte e cinco") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt"), "duas menos um quarto") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt"), "cinco menos dez") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt"), "seis menos cinco") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt", use_ampm=True), "cinco e meia da madrugada") dt = datetime.datetime(2017, 1, 31, 23, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="pt-pt", 
use_24hour=True, use_ampm=True), "vinte e três e quinze") self.assertEqual(nice_time(dt, lang="pt-pt", use_24hour=False, use_ampm=True), "onze e um quarto da noite") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_ru.py000066400000000000000000000765601426211343400224350ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import json import unittest import datetime import ast import sys from pathlib import Path from lingua_franca import get_default_lang, set_default_lang, \ load_language, unload_language from lingua_franca.format import date_time_format from lingua_franca.format import join_list from lingua_franca.format import nice_date from lingua_franca.format import nice_date_time from lingua_franca.format import nice_duration from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import nice_year from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone def setUpModule(): load_language("ru-ru") set_default_lang("ru") def tearDownModule(): unload_language("ru") NUMBERS_FIXTURE_RU = { 1.435634: '1.436', 2: '2', 5.0: '5', 0.027: '0.027', 0.5: 'половина', 1.333: '1 и 1 треть', 2.666: '2 и 2 трети', 0.25: 'четверть', 1.25: '1 и 1 четверть', 0.75: '3 четверти', 1.75: '1 и 3 четверти', 3.4: '3 и 2 пятые', 16.8333: '16 и 5 шестых', 12.5714: '12 и 4 седьмые', 9.625: '9 и 5 восьмых', 
6.777: '6 и 7 девятых', 3.1: '3 и 1 десятая', 2.272: '2 и 3 одиннадцатые', 5.583: '5 и 7 двенадцатых', 8.384: '8 и 5 тринадцатых', 0.071: '1 четырнадцатая', 6.466: '6 и 7 пятнадцатых', 8.312: '8 и 5 шестнадцатых', 2.176: '2 и 3 семнадцатые', 200.722: '200 и 13 восемнадцатых', 7.421: '7 и 8 девятнадцатых', 0.05: '1 двадцатая' } class TestNiceNumberFormat(unittest.TestCase): def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_RU.items(): self.assertEqual(nice_number(number, speech=True), number_str, 'должен отформатировать {} как {}, а не {}'.format( number, number_str, nice_number(number, speech=True))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, speech=True, denominators=[1, 2, 3]), '5 с половиной', 'должен отформатировать 5.5 как 5 с половиной, а не {}'.format( nice_number(5.5, speech=True, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, speech=True, denominators=[1, 2]), '2.333', 'должен отформатировать 2.333 как 2.333, а не {}'.format( nice_number(2.333, speech=True, denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'должен отформатировать 6.777 как 6 7/9, а не {}'.format( nice_number(6.777, speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'должен отформатировать 6.0 как 6, а не {}'.format( nice_number(6.0, speech=False))) class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0), "ноль") self.assertEqual(pronounce_number(1), "один") self.assertEqual(pronounce_number(10), "десять") self.assertEqual(pronounce_number(15), "пятнадцать") self.assertEqual(pronounce_number(20), "двадцать") self.assertEqual(pronounce_number(27), "двадцать семь") self.assertEqual(pronounce_number(30), "тридцать") self.assertEqual(pronounce_number(33), "тридцать три") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1), "минус один") 
self.assertEqual(pronounce_number(-10), "минус десять") self.assertEqual(pronounce_number(-15), "минус пятнадцать") self.assertEqual(pronounce_number(-20), "минус двадцать") self.assertEqual(pronounce_number(-27), "минус двадцать семь") self.assertEqual(pronounce_number(-30), "минус тридцать") self.assertEqual(pronounce_number(-33), "минус тридцать три") def test_convert_decimals(self): self.assertEqual(pronounce_number(0.05), "ноль точка ноль пять") self.assertEqual(pronounce_number(-0.05), "минус ноль точка ноль пять") self.assertEqual(pronounce_number(1.234), "один точка два три") self.assertEqual(pronounce_number(21.234), "двадцать один точка два три") self.assertEqual(pronounce_number(21.234, places=1), "двадцать один точка два") self.assertEqual(pronounce_number(21.234, places=0), "двадцать один") self.assertEqual(pronounce_number(21.234, places=3), "двадцать один точка два три четыре") self.assertEqual(pronounce_number(21.234, places=4), "двадцать один точка два три четыре") self.assertEqual(pronounce_number(21.234, places=5), "двадцать один точка два три четыре") self.assertEqual(pronounce_number(-1.234), "минус один точка два три") self.assertEqual(pronounce_number(-21.234), "минус двадцать один точка два три") self.assertEqual(pronounce_number(-21.234, places=1), "минус двадцать один точка два") self.assertEqual(pronounce_number(-21.234, places=0), "минус двадцать один") self.assertEqual(pronounce_number(-21.234, places=3), "минус двадцать один точка два три четыре") self.assertEqual(pronounce_number(-21.234, places=4), "минус двадцать один точка два три четыре") self.assertEqual(pronounce_number(-21.234, places=5), "минус двадцать один точка два три четыре") def test_convert_stos(self): self.assertEqual(pronounce_number(100), "сто") self.assertEqual(pronounce_number(666), "шестьсот шестьдесят шесть") self.assertEqual(pronounce_number(1456), "тысяча четыреста пятьдесят шесть") self.assertEqual(pronounce_number(103254654), "сто три миллиона " "двести 
пятьдесят " "четыре тысячи " "шестьсот " "пятьдесят четыре") self.assertEqual(pronounce_number(1512457), "миллион пятьсот" " двенадцать тысяч " "четыреста пятьдесят " "семь") self.assertEqual(pronounce_number(209996), "двести девять " "тысяч девятьсот " "девяносто шесть") def test_convert_scientific_notation(self): self.assertEqual(pronounce_number(0, scientific=True), "ноль") self.assertEqual(pronounce_number(33, scientific=True), "три точка три на десять в степени один") self.assertEqual(pronounce_number(299792458, scientific=True), "два точка девять девять на десять в степени восемь") self.assertEqual(pronounce_number(299792458, places=6, scientific=True), "два точка девять девять семь девять два пять " "на десять в степени восемь") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True), "один точка шесть семь два на десять в степени " "минус двадцать семь") def test_auto_scientific_notation(self): self.assertEqual( pronounce_number(1.1e-150), "один точка один на десять в степени " "минус сто пятьдесят") def test_large_numbers(self): self.maxDiff = None self.assertEqual( pronounce_number(299792458, short_scale=True), "двести девяносто девять миллионов семьсот " "девяносто две тысячи четыреста пятьдесят восемь") self.assertEqual( pronounce_number(299792458, short_scale=False), "двести девяносто девять миллионов семьсот " "девяносто две тысячи четыреста пятьдесят восемь") self.assertEqual( pronounce_number(100034000000299792458, short_scale=True), "сто квинтиллионов тридцать четыре квадриллиона " "двести девяносто девять миллионов семьсот " "девяносто две тысячи четыреста пятьдесят восемь") self.assertEqual( pronounce_number(100034000000299792458, short_scale=False), "сто биллионов тридцать четыре тысячи миллиардов " "двести девяносто девять миллионов семьсот " "девяносто две тысячи четыреста пятьдесят восемь") self.assertEqual( pronounce_number(1e10, short_scale=True), "десять миллиардов") self.assertEqual( pronounce_number(1e12, 
short_scale=True), "триллион") # TODO maybe beautify this self.assertEqual( pronounce_number(1000001, short_scale=True), "миллион один") self.assertEqual(pronounce_number(95505896639631893, short_scale=True), "девяносто пять квадриллионов " "пятьсот пять триллионов " "восемьсот девяносто шесть миллиардов " "шестьсот тридцать девять миллионов " "шестьсот тридцать одна тысяча " "восемьсот девяносто три") self.assertEqual(pronounce_number(95505896639631893, short_scale=False), "девяносто пять тысяч пятьсот пять миллиардов " "восемьсот девяносто шесть тысяч " "шестьсот тридцать девять миллионов " "шестьсот тридцать одна тысяча " "восемьсот девяносто три") self.assertEqual(pronounce_number(10e80, places=1), "секснвигинтиллион") # TODO floating point rounding issues might happen self.assertEqual(pronounce_number(1.9874522571e80, places=9), "сто девяносто восемь квинвигинтиллионов " "семьсот сорок пять кватторвигинтиллионов " "двести двадцать пять тревигинтиллионов " "семьсот девять дуовигинтиллионов " "девятьсот девяносто девять унвигинтиллионов " "девятьсот восемьдесят девять вигинтиллионов " "семьсот тридцать новемдециллионов " "девятьсот девятнадцать октодециллионов " "девятьсот девяносто девять септендециллионов " "девятьсот пятьдесят пять сексдециллионов " "четыреста девяносто восемь квиндециллионов " "двести четырнадцать кваттордециллионов " "восемьсот сорок пять тредециллионов " "четыреста двадцать девять дуодециллионов " "четыреста сорок четыре ундециллиона " "триста тридцать шесть дециллионов " "семьсот двадцать четыре нониллиона " "пятьсот шестьдесят девять октиллионов " "триста семьдесят пять септиллионов " "двести тридцать девять секстиллионов " "шестьсот семьдесят квинтиллионов " "пятьсот семьдесят четыре квадриллиона " "семьсот тридцать девять триллионов " "семьсот сорок восемь миллиардов " "четыреста семьдесят миллионов " "девятьсот пятнадцать тысяч " "семьдесят два") # infinity self.assertEqual( pronounce_number(sys.float_info.max * 2), "бесконечность") 
self.assertEqual( pronounce_number(float("inf")), "бесконечность") self.assertEqual( pronounce_number(float("-inf")), "минус бесконечность") def test_ordinals(self): self.assertEqual(pronounce_number(1, ordinals=True), "первый") self.assertEqual(pronounce_number(10, ordinals=True), "десятый") self.assertEqual(pronounce_number(15, ordinals=True), "пятнадцатый") self.assertEqual(pronounce_number(20, ordinals=True), "двадцатый") self.assertEqual(pronounce_number(27, ordinals=True), "двадцать седьмой") self.assertEqual(pronounce_number(30, ordinals=True), "тридцатый") self.assertEqual(pronounce_number(33, ordinals=True), "тридцать третий") self.assertEqual(pronounce_number(100, ordinals=True), "сотый") self.assertEqual(pronounce_number(1000, ordinals=True), "тысячный") self.assertEqual(pronounce_number(10000, ordinals=True), "десятитысячный") self.assertEqual(pronounce_number(18691, ordinals=True), "восемнадцать тысяч шестьсот девяносто первый") self.assertEqual(pronounce_number(1567, ordinals=True), "тысяча пятьсот шестьдесят седьмой") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True, ordinals=True), "один точка шесть семь два на десять в минус " "двадцать седьмой степени") self.assertEqual(pronounce_number(1e6, ordinals=True), "миллионный") self.assertEqual(pronounce_number(2e6, ordinals=True), "двухмиллионный") self.assertEqual(pronounce_number(2e6, ordinals=True, short_scale=False), "двухмиллионный") self.assertEqual(pronounce_number(3e6, ordinals=True), "трёхмиллионный") self.assertEqual(pronounce_number(4e6, ordinals=True), "четырёхмиллионный") self.assertEqual(pronounce_number(18e6, ordinals=True), "восемнадцатимиллионный") self.assertEqual(pronounce_number(18e12, ordinals=True, short_scale=False), "восемнадцатибиллионный") self.assertEqual(pronounce_number(18e12, ordinals=True), "восемнадцатитриллионный") self.assertEqual(pronounce_number(18e18, ordinals=True, short_scale=False), "восемнадцатитриллионный") class 
TestNiceDateFormat(unittest.TestCase): @classmethod def setUpClass(cls): # Read date_time_test.json files for test data cls.test_config = {} p = Path(date_time_format.config_path) for sub_dir in [x for x in p.iterdir() if x.is_dir()]: if (sub_dir / 'date_time_test.json').exists(): print("Loading test for " + str(sub_dir / 'date_time_test.json')) with (sub_dir / 'date_time_test.json').open() as f: cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed self.assertEqual(nice_time(dt), nice_time(dt, speech=True, use_24hour=True, use_ampm=False)) self.assertEqual(nice_time(dt, use_24hour=False), "час двадцать два") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "час двадцать два дня") self.assertEqual(nice_time(dt, speech=False, use_24hour=False), "1:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), "1:22 дня") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "тринадцать двадцать два") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "тринадцать двадцать два") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "час") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "час дня") self.assertEqual(nice_time(dt, use_24hour=False, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), "1:00 дня") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "тринадцать ровно") self.assertEqual(nice_time(dt, 
use_24hour=True, use_ampm=False), "тринадцать ровно") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "час ноль два") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "час ноль два дня") self.assertEqual(nice_time(dt, use_24hour=False, speech=False), "1:02") self.assertEqual(nice_time(dt, use_24hour=False, speech=False, use_ampm=True), "1:02 дня") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "тринадцать ноль два") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "тринадцать ноль два") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "двенадцать ноль два") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "двенадцать ноль два ночи") self.assertEqual(nice_time(dt, speech=False, use_24hour=False), "12:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), "12:02 ночи") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "ноль ноль ноль два") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "ноль ноль ноль два") dt = datetime.datetime(2018, 2, 8, 1, 2, 33, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "час ноль два") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "час ноль два ночи") self.assertEqual(nice_time(dt, speech=False, use_24hour=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), "1:02 ночи") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "01:02") 
self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "01:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "ноль один ноль два") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "ноль один ноль два") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "двенадцать с четвертью") self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "двенадцать с четвертью дня") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), "пять с половиной утра") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_24hour=False), "без четверти два") def test_nice_date(self): lang = "ru-ru" i = 1 while (self.test_config[lang].get('test_nice_date') and self.test_config[lang]['test_nice_date'].get(str(i).encode('utf8'))): p = self.test_config[lang]['test_nice_date'][str(i)] dp = ast.literal_eval(p['datetime_param']) np = ast.literal_eval(p['now']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], tzinfo=default_timezone()) now = None if not np else datetime.datetime( np[0], np[1], np[2], np[3], np[4], np[5], tzinfo=default_timezone()) print('Testing for ' + lang + ' that ' + str(dt) + ' is date ' + p['assertEqual']) self.assertEqual(p['assertEqual'], nice_date(dt, lang=lang, now=now)) i = i + 1 # test all days in a year for all languages, # that some output is produced # for lang in self.test_config: for dt in (datetime.datetime(2017, 12, 30, 0, 2, 3, tzinfo=default_timezone()) + datetime.timedelta(n) for n in range(368)): self.assertTrue(len(nice_date(dt, lang=lang)) > 0) def test_nice_date_time(self): lang = "ru-ru" i = 1 while (self.test_config[lang].get('test_nice_date_time') and self.test_config[lang]['test_nice_date_time'].get(str(i).encode('utf8'))): p = 
self.test_config[lang]['test_nice_date_time'][str(i)] dp = ast.literal_eval(p['datetime_param']) np = ast.literal_eval(p['now']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], tzinfo=default_timezone()) now = None if not np else datetime.datetime( np[0], np[1], np[2], np[3], np[4], np[5]) print('Testing for ' + lang + ' that ' + str(dt) + ' is date time ' + p['assertEqual']) self.assertEqual( p['assertEqual'], nice_date_time( dt, lang=lang, now=now, use_24hour=ast.literal_eval(p['use_24hour']), use_ampm=ast.literal_eval(p['use_ampm']))) i = i + 1 def test_nice_year(self): lang = "ru-ru" i = 1 while (self.test_config[lang].get('test_nice_year') and self.test_config[lang]['test_nice_year'].get(str(i).encode('utf8'))): p = self.test_config[lang]['test_nice_year'][str(i)] dp = ast.literal_eval(p['datetime_param']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], tzinfo=default_timezone()) print('Testing for ' + lang + ' that ' + str(dt) + ' is year ' + p['assertEqual']) self.assertEqual(p['assertEqual'], nice_year( dt, lang=lang, bc=ast.literal_eval(p['bc']))) i = i + 1 # Test all years from 0 to 9999 for all languages, # that some output is produced print("Test all years in " + lang) for i in range(1, 9999): dt = datetime.datetime(i, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertTrue(len(nice_year(dt, lang=lang)) > 0) # Looking through the date sequence can be helpful def test_nice_duration(self): self.assertEqual(nice_duration(1), "одна секунда") self.assertEqual(nice_duration(3), "три секунды") self.assertEqual(nice_duration(1, speech=False), "0:01") self.assertEqual(nice_duration(61), "одна минута одна секунда") self.assertEqual(nice_duration(61, speech=False), "1:01") self.assertEqual(nice_duration(5000), "один час двадцать три минуты двадцать секунд") self.assertEqual(nice_duration(5000, speech=False), "1:23:20") self.assertEqual(nice_duration(50000), "тринадцать часов пятьдесят три минуты двадцать секунд") 
self.assertEqual(nice_duration(50000, speech=False), "13:53:20") self.assertEqual(nice_duration(500000), "пять дней восемнадцать часов пятьдесят три минуты двадцать секунд") # nopep8 self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), speech=False), "5d 18:53:20") def test_join(self): self.assertEqual(join_list(None, "и"), "") self.assertEqual(join_list([], "и"), "") self.assertEqual(join_list(["a"], "и"), "a") self.assertEqual(join_list(["a", "b"], "и"), "a и b") self.assertEqual(join_list(["a", "b"], "или"), "a или b") self.assertEqual(join_list(["a", "b", "c"], "и"), "a, b и c") self.assertEqual(join_list(["a", "b", "c"], "или"), "a, b или c") self.assertEqual( join_list(["a", "b", "c"], "или", ";"), "a; b или c") self.assertEqual( join_list(["a", "b", "c", "d"], "или"), "a, b, c или d") self.assertEqual(join_list([1, "b", 3, "d"], "или"), "1, b, 3 или d") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_sl.py000066400000000000000000000645751426211343400224300ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import json import unittest import datetime import ast import sys from pathlib import Path from lingua_franca import get_default_lang, set_default_lang from lingua_franca.format import nice_number from lingua_franca.format import nice_time from lingua_franca.format import nice_date from lingua_franca.format import nice_date_time from lingua_franca.format import nice_year from lingua_franca.format import nice_duration from lingua_franca.format import pronounce_number from lingua_franca.format import date_time_format from lingua_franca.format import join_list from lingua_franca.time import default_timezone NUMBERS_FIXTURE_SL = { 1.435634: '1.436', 2: '2', 5.0: '5', 0.027: '0.027', 0.5: '1 polovica', 1.333: '1 in 1 tretjina', 2.666: '2 in 2 tretjini', 0.25: '1 četrtina', 1.25: '1 in 1 četrtina', 0.75: '3 četrtine', 1.75: '1 in 3 četrtine', 3.4: '3 in 2 petini', 16.8333: '16 in 5 šestin', 12.5714: '12 in 4 sedmine', 9.625: '9 in 5 osmin', 6.777: '6 in 7 devetin', 3.1: '3 in 1 desetina', 2.272: '2 in 3 enajstine', 5.583: '5 in 7 dvanajstin', 8.384: '8 in 5 trinajstin', 0.071: '1 štirinajstina', 6.466: '6 in 7 petnajstin', 8.312: '8 in 5 šestnajstin', 2.176: '2 in 3 sedemnajstine', 200.722: '200 in 13 osemnajstin', 7.421: '7 in 8 devetnajstin', 0.05: '1 dvajsetina' } class TestNiceNumberFormat(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("sl-si") def tearDown(self): if self.old_lang: set_default_lang(self.old_lang) def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_SL.items(): self.assertEqual(nice_number(number), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), '5 in 1 polovica', 'should format 5.5 as 5 in 1 polovica not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, denominators=[1, 2]), '2.333', 'should 
format 2.333 as 2.333 not {}'.format( nice_number(2.333, denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, speech=False))) class TestPronounceNumber(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("sl-si") def tearDown(self): if self.old_lang: set_default_lang(self.old_lang) def test_convert_int(self): self.assertEqual(pronounce_number(0), "nič") self.assertEqual(pronounce_number(1), "ena") self.assertEqual(pronounce_number(10), "deset") self.assertEqual(pronounce_number(15), "petnajst") self.assertEqual(pronounce_number(20), "dvajset") self.assertEqual(pronounce_number(27), "sedemindvajset") self.assertEqual(pronounce_number(30), "trideset") self.assertEqual(pronounce_number(33), "triintrideset") def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1), "minus ena") self.assertEqual(pronounce_number(-10), "minus deset") self.assertEqual(pronounce_number(-15), "minus petnajst") self.assertEqual(pronounce_number(-20), "minus dvajset") self.assertEqual(pronounce_number(-27), "minus sedemindvajset") self.assertEqual(pronounce_number(-30), "minus trideset") self.assertEqual(pronounce_number(-33), "minus triintrideset") def test_convert_decimals(self): self.assertEqual(pronounce_number(0.05), "nič celih nič pet") self.assertEqual(pronounce_number(-0.05), "minus nič celih nič pet") self.assertEqual(pronounce_number(1.234), "ena cela dve tri") self.assertEqual(pronounce_number(21.234), "enaindvajset celih dve tri") self.assertEqual(pronounce_number(21.234, places=1), "enaindvajset celih dve") self.assertEqual(pronounce_number(21.234, places=0), "enaindvajset") self.assertEqual(pronounce_number(21.234, places=3), "enaindvajset celih dve tri štiri") 
self.assertEqual(pronounce_number(21.234, places=4), "enaindvajset celih dve tri štiri") self.assertEqual(pronounce_number(21.234, places=5), "enaindvajset celih dve tri štiri") self.assertEqual(pronounce_number(-1.234), "minus ena cela dve tri") self.assertEqual(pronounce_number(-21.234), "minus enaindvajset celih dve tri") self.assertEqual(pronounce_number(-21.234, places=1), "minus enaindvajset celih dve") self.assertEqual(pronounce_number(-21.234, places=0), "minus enaindvajset") self.assertEqual(pronounce_number(-21.234, places=3), "minus enaindvajset celih dve tri štiri") self.assertEqual(pronounce_number(-21.234, places=4), "minus enaindvajset celih dve tri štiri") self.assertEqual(pronounce_number(-21.234, places=5), "minus enaindvajset celih dve tri štiri") def test_convert_hundreds(self): self.assertEqual(pronounce_number(100), "sto") self.assertEqual(pronounce_number(666), "šeststo šestinšestdeset") self.assertEqual(pronounce_number( 1456), "tisoč štiristo šestinpetdeset") self.assertEqual(pronounce_number(103254654), "sto trije milijoni " "dvesto štiriinpetdeset " "tisoč šeststo " "štiriinpetdeset") self.assertEqual(pronounce_number(1512457), "milijon petsto dvanajst" " tisoč štiristo " "sedeminpetdeset") self.assertEqual(pronounce_number(209996), "dvesto devet tisoč " "devetsto šestindevetdeset") def test_convert_scientific_notation(self): self.assertEqual(pronounce_number(0, scientific=True), "nič") self.assertEqual(pronounce_number(33, scientific=True), "tri cele tri krat deset na ena") self.assertEqual(pronounce_number(299792458, scientific=True), "dve celi devet devet krat deset na osem") self.assertEqual(pronounce_number(299792458, places=6, scientific=True), "dve celi devet devet sedem devet dve pet " "krat deset na osem") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True), "ena cela šest sedem dve krat deset na " "minus sedemindvajset") def test_auto_scientific_notation(self): self.assertEqual( pronounce_number(1.1e-150), 
"ena cela ena krat deset na " "minus sto petdeset") # value is platform dependent so better not use in tests? # self.assertEqual( # pronounce_number(sys.float_info.min), "dve celi dve dve krat " # "deset na minus tristo osem") # self.assertEqual( # pronounce_number(sys.float_info.max), "ena cela sedem devet krat " # "deset na tristo osem") def test_large_numbers(self): self.assertEqual( pronounce_number(299792458, short_scale=True), "dvesto devetindevetdeset milijonov sedemsto " "dvaindevetdeset tisoč štiristo oseminpetdeset") self.assertEqual( pronounce_number(299792458, short_scale=False), "dvesto devetindevetdeset milijonov sedemsto " "dvaindevetdeset tisoč štiristo oseminpetdeset") self.assertEqual( pronounce_number(100034000000299792458, short_scale=True), "sto kvintilijonov štiriintrideset kvadrilijonov " "dvesto devetindevetdeset milijonov sedemsto " "dvaindevetdeset tisoč štiristo oseminpetdeset") self.assertEqual( pronounce_number(100034000000299792458, short_scale=False), "sto trilijonov štiriintrideset bilijard " "dvesto devetindevetdeset milijonov sedemsto " "dvaindevetdeset tisoč štiristo oseminpetdeset") self.assertEqual( pronounce_number(10000000000, short_scale=True), "deset bilijonov") self.assertEqual( pronounce_number(1000000000000, short_scale=True), "trilijon") # TODO maybe beautify this self.assertEqual( pronounce_number(1000001, short_scale=True), "milijon ena") self.assertEqual(pronounce_number(95505896639631893), "petindevetdeset kvadrilijonov petsto pet trilijonov " "osemsto šestindevetdeset bilijonov šeststo devetintrideset " "milijonov šeststo enaintrideset tisoč osemsto triindevetdeset") self.assertEqual(pronounce_number(95505896639631893, short_scale=False), "petindevetdeset bilijard osemsto " "šestindevetdeset milijard šeststo enaintrideset " "tisoč osemsto triindevetdeset") # TODO floating point rounding issues might happen # Automatic switch to scientific notation because such big numbers are not (yet) supported 
self.assertEqual(pronounce_number(1.9874522571e80, places=9), "ena cela devet osem sedem štiri " "pet dve krat deset na osemdeset") self.assertEqual(pronounce_number(1.00000000000000001e150), "ena krat deset na sto petdeset") # infinity self.assertEqual( pronounce_number(sys.float_info.max * 2), "neskončno") self.assertEqual( pronounce_number(float("inf")), "neskončno") self.assertEqual( pronounce_number(float("-inf")), "minus neskončno") def test_ordinals(self): self.assertEqual(pronounce_number(1, ordinals=True), "prvi") self.assertEqual(pronounce_number(10, ordinals=True), "deseti") self.assertEqual(pronounce_number(15, ordinals=True), "petnajsti") self.assertEqual(pronounce_number(20, ordinals=True), "dvajseti") self.assertEqual(pronounce_number( 27, ordinals=True), "sedemindvajseti") self.assertEqual(pronounce_number(30, ordinals=True), "trideseti") self.assertEqual(pronounce_number(33, ordinals=True), "triintrideseti") self.assertEqual(pronounce_number(100, ordinals=True), "stoti") self.assertEqual(pronounce_number(1000, ordinals=True), "tisoči") self.assertEqual(pronounce_number(10000, ordinals=True), "desettisoči") self.assertEqual(pronounce_number(18691, ordinals=True), "osemnajsttisočšeststoenaindevetdeseti") self.assertEqual(pronounce_number(1567, ordinals=True), "tisočpetstosedeminšestdeseti") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True, ordinals=True), "ena cela šest sedem dve krat " "deset na minus sedemindvajseti") self.assertEqual(pronounce_number(18e6, ordinals=True), "osemnajstmilijonti") self.assertEqual(pronounce_number(18e12, ordinals=True, short_scale=False), "osemnajstbilijonti") self.assertEqual(pronounce_number(18e12, ordinals=True), "osemnajsttrilijonti") self.assertEqual(pronounce_number(18e18, ordinals=True, short_scale=False), "osemnajsttrilijonti") class TestNiceDateFormat(unittest.TestCase): def setUp(self): self.old_lang = get_default_lang() set_default_lang("sl-si") def tearDown(self): if self.old_lang: 
set_default_lang(self.old_lang) @classmethod def setUpClass(cls): # Read date_time_test.json files for test data language = "sl-si" config = date_time_format.config_path + "/" + language + "/date_time_test.json" cls.test_config = {} with open(config, encoding="utf8") as file: cls.test_config[language] = json.loads(file.read()) def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed self.assertEqual(nice_time(dt), nice_time(dt, "sl-si", True, False, False)) self.assertEqual(nice_time(dt), "dvaindvajset čez ena") self.assertEqual(nice_time(dt, use_ampm=True), "dvaindvajset čez ena p.m.") self.assertEqual(nice_time(dt, speech=False), "1:22") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:22 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "trinajst dvaindvajset") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "trinajst dvaindvajset") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "ena") self.assertEqual(nice_time(dt, use_ampm=True), "ena p.m.") self.assertEqual(nice_time(dt, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:00 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "trinajst nič nič") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "trinajst nič nič") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dve čez ena") self.assertEqual(nice_time(dt, use_ampm=True), "dve čez ena p.m.") self.assertEqual(nice_time(dt, speech=False), 
"1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 PM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "trinajst nič dve") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "trinajst nič dve") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dve čez dvanajst") self.assertEqual(nice_time(dt, use_ampm=True), "dve čez dvanajst a.m.") self.assertEqual(nice_time(dt, speech=False), "12:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "nič nič dve") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "nič nič dve") dt = datetime.datetime(2017, 1, 31, 20, 40, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dvajset do devetih") self.assertEqual(nice_time(dt, use_ampm=True), "dvajset do devetih p.m.") dt = datetime.datetime(2017, 1, 31, 0, 58, 40, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dve do enih") self.assertEqual(nice_time(dt, use_ampm=True), "dve do enih a.m.") dt = datetime.datetime(2018, 2, 8, 1, 2, 33, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "dve čez ena") self.assertEqual(nice_time(dt, use_ampm=True), "dve čez ena a.m.") self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), "1:02 AM") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "01:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "01:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), "ena nič dve") 
self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), "ena nič dve") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "petnajst čez dvanajst") self.assertEqual(nice_time(dt, use_ampm=True), "petnajst čez dvanajst p.m.") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_ampm=True), "petnajst čez ena a.m.") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_ampm=True), "petnajst do dveh a.m.") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_ampm=True), "pol šestih a.m.") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "petnajst do dveh") def test_nice_date(self): for lang in self.test_config: i = 1 while (self.test_config[lang].get('test_nice_date') and self.test_config[lang]['test_nice_date'].get(str(i))): p = self.test_config[lang]['test_nice_date'][str(i)] dp = ast.literal_eval(p['datetime_param']) np = ast.literal_eval(p['now']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], tzinfo=default_timezone()) now = None if not np else datetime.datetime( np[0], np[1], np[2], np[3], np[4], np[5], tzinfo=default_timezone()) print('Testing for ' + lang + ' that ' + str(dt) + ' is date ' + p['assertEqual']) self.assertEqual(p['assertEqual'], nice_date(dt, lang=lang, now=now)) i = i + 1 def test_nice_date_time(self): for lang in self.test_config: i = 1 while (self.test_config[lang].get('test_nice_date_time') and self.test_config[lang]['test_nice_date_time'].get(str(i))): p = self.test_config[lang]['test_nice_date_time'][str(i)] dp = ast.literal_eval(p['datetime_param']) np = ast.literal_eval(p['now']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], tzinfo=default_timezone()) now = None if not np else datetime.datetime( np[0], np[1], np[2], 
np[3], np[4], np[5], tzinfo=default_timezone()) print('Testing for ' + lang + ' that ' + str(dt) + ' is date time ' + p['assertEqual']) self.assertEqual( p['assertEqual'], nice_date_time( dt, lang=lang, now=now, use_24hour=ast.literal_eval(p['use_24hour']), use_ampm=ast.literal_eval(p['use_ampm']))) i = i + 1 def test_nice_year(self): for lang in self.test_config: i = 1 while (self.test_config[lang].get('test_nice_year') and self.test_config[lang]['test_nice_year'].get(str(i))): p = self.test_config[lang]['test_nice_year'][str(i)] dp = ast.literal_eval(p['datetime_param']) dt = datetime.datetime( dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], tzinfo=default_timezone()) print('Testing for ' + lang + ' that ' + str(dt) + ' is year ' + p['assertEqual']) self.assertEqual(p['assertEqual'], nice_year( dt, lang=lang, bc=ast.literal_eval(p['bc']))) i = i + 1 # Test all years from 0 to 9999 for all languages, # that some output is produced for lang in self.test_config: print("Test all years in " + lang) for i in range(1, 9999): dt = datetime.datetime(i, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertTrue(len(nice_year(dt, lang=lang)) > 0) # Looking through the date sequence can be helpful # print(nice_year(dt, lang=lang)) def test_nice_duration(self): # TODO implement better plural support for nice_duration # Correct results are in comments self.assertEqual(nice_duration(1), "ena sekunda") self.assertEqual(nice_duration(2), "dve sekund") # dve sekundi self.assertEqual(nice_duration(3), "tri sekund") # tri sekunde self.assertEqual(nice_duration(4), "štiri sekund") # štiri sekunde self.assertEqual(nice_duration(5), "pet sekund") self.assertEqual(nice_duration(6), "šest sekund") self.assertEqual(nice_duration(1, speech=False), "0:01") self.assertEqual(nice_duration(61), "ena minuta ena sekunda") self.assertEqual(nice_duration(61, speech=False), "1:01") self.assertEqual(nice_duration(5000), "ena ura triindvajset minut dvajset sekund") self.assertEqual(nice_duration(5000, 
speech=False), "1:23:20") self.assertEqual(nice_duration(50000), "trinajst ur triinpetdeset minut dvajset sekund") self.assertEqual(nice_duration(50000, speech=False), "13:53:20") self.assertEqual(nice_duration(500000), "pet dni osemnajst ur triinpetdeset minut dvajset sekund") # nopep8 self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), speech=False), "5d 18:53:20") def test_join(self): self.assertEqual(join_list(None, "in"), "") self.assertEqual(join_list([], "in"), "") self.assertEqual(join_list(["a"], "in"), "a") self.assertEqual(join_list(["a", "b"], "in"), "a in b") self.assertEqual(join_list(["a", "b"], "ali"), "a ali b") self.assertEqual(join_list(["a", "b", "c"], "in"), "a, b in c") self.assertEqual(join_list(["a", "b", "c"], "ali"), "a, b ali c") self.assertEqual(join_list(["a", "b", "c"], "ali", ";"), "a; b ali c") self.assertEqual( join_list(["a", "b", "c", "d"], "ali"), "a, b, c ali d") self.assertEqual(join_list([1, "b", 3, "d"], "ali"), "1, b, 3 ali d") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_format_sv.py000066400000000000000000000424751426211343400224350ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
#
import unittest
import datetime

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.format import nice_number, nice_time, pronounce_number
from lingua_franca.lang.format_sv import pronounce_ordinal_sv
from lingua_franca.time import default_timezone


def setUpModule():
    """Load Swedish and make it the default language for this module."""
    load_language('sv')
    set_default_lang('sv')


def tearDownModule():
    """Unload Swedish once every test in this module has run.

    unittest looks this hook up by its exact name, ``tearDownModule``.
    The previous spelling (``TearDownModule``) was never called, so the
    language stayed loaded after this module finished.
    """
    unload_language('sv')


# Backward-compatible alias for the old, misspelled hook name.
TearDownModule = tearDownModule


# fractions are not capitalized for now

# Maps an input number to the spoken Swedish form that the test in this
# module expects nice_number() to return for it.
NUMBERS_FIXTURE_sv = {
    1.435634: '1.436',
    2: '2',
    5.0: '5',
    1234567890: '1234567890',
    12345.67890: '12345.679',
    0.027: '0.027',
    0.5: 'en halv',
    1.333: '1 och en tredjedel',
    2.666: '2 och 2 tredjedelar',
    0.25: 'en fjärdedel',
    1.25: '1 och en fjärdedel',
    0.75: '3 fjärdedelar',
    1.75: '1 och 3 fjärdedelar',
    3.4: '3 och 2 femtedelar',
    16.8333: '16 och 5 sjättedelar',
    12.5714: '12 och 4 sjundedelar',
    9.625: '9 och 5 åttondelar',
    6.777: '6 och 7 niondelar',
    3.1: '3 och en tiondel',
    2.272: '2 och 3 elftedelar',
    5.583: '5 och 7 tolftedelar',
    8.384: '8 och 5 trettondelar',
    0.071: 'en fjortondel',
    6.466: '6 och 7 femtondelar',
    8.312: '8 och 5 sextondelar',
    2.176: '2 och 3 sjuttondelar',
    200.722: '200 och 13 artondelar',
    7.421: '7 och 8 nittondelar',
    0.05: 'en tjugondel'
}


# class TestNiceResponse(unittest.TestCase):
#     def test_replace_ordinal(self):
#         self.assertEqual(nice_response_sv("det er den 31. maj"),
#                          "det er den enogtredifte maj")
#         self.assertEqual(nice_response_sv("Det begynder den 31. maj"),
#                          "Det begynder den enogtrefte maj")
#         self.assertEqual(nice_response_sv("den 31.
mai"), # "den enogtrefte maj") # self.assertEqual(nice_response_sv("10 ^ 2"), "ti to") class TestNiceNumberFormat(unittest.TestCase): def test_convert_float_to_nice_number(self): for number, number_str in NUMBERS_FIXTURE_sv.items(): self.assertEqual(nice_number(number, lang="sv-se"), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number, lang="sv-se"))) def test_specify_danominator(self): self.assertEqual(nice_number(5.5, lang="sv-se", denominators=[1, 2, 3]), '5 och en halv', 'should format 5.5 as 5 und ein halb not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, lang="sv-se", denominators=[1, 2]), '2.333', 'should format 2,333 as 2.333 not {}'.format( nice_number(2.333, lang="sv-se", denominators=[1, 2]))) def test_no_speech(self): self.assertEqual(nice_number(6.777, speech=False), '6 7/9', 'should format 6.777 as 6 7/9 not {}'.format( nice_number(6.777, lang="sv-se", speech=False))) self.assertEqual(nice_number(6.0, speech=False), '6', 'should format 6.0 as 6 not {}'.format( nice_number(6.0, lang="sv-se", speech=False))) class TestPronounceOrdinal(unittest.TestCase): def test_convert_int_sv(self): self.assertEqual(pronounce_ordinal_sv(0), "noll") self.assertEqual(pronounce_ordinal_sv(1), "första") self.assertEqual(pronounce_ordinal_sv(3), "tredje") self.assertEqual(pronounce_ordinal_sv(5), "femte") self.assertEqual(pronounce_ordinal_sv(21), "tjugoförsta") self.assertEqual(pronounce_ordinal_sv(2000), "tvåtusende") self.assertEqual(pronounce_ordinal_sv(1000), "ettusende") # self.assertEqual(pronounce_ordinal_sv(123456), # "ethundredetreogtyvetusindefirehundredeseksog\ # halvtresende") class TestPronounceNumber(unittest.TestCase): def test_convert_int_sv(self): self.assertEqual(pronounce_number(123456789123456789, lang="sv-se"), "etthundratjugotrebiljarder " "fyrahundrafemtiosexbiljoner " "sjuhundraåttioniomiljarder " "etthundratjugotremiljoner " "fyrahundrafemtiosextusen " 
"sjuhundraåttionio") self.assertEqual(pronounce_number(1, lang="sv-se"), "en") self.assertEqual(pronounce_number(10, lang="sv-se"), "tio") self.assertEqual(pronounce_number(15, lang="sv-se"), "femton") self.assertEqual(pronounce_number(20, lang="sv-se"), "tjugo") self.assertEqual(pronounce_number(27, lang="sv-se"), "tjugosju") self.assertEqual(pronounce_number(30, lang="sv-se"), "trettio") self.assertEqual(pronounce_number(33, lang="sv-se"), "trettiotre") self.assertEqual(pronounce_number(71, lang="sv-se"), "sjuttioen") self.assertEqual(pronounce_number(80, lang="sv-se"), "åttio") self.assertEqual(pronounce_number(74, lang="sv-se"), "sjuttiofyra") self.assertEqual(pronounce_number(79, lang="sv-se"), "sjuttionio") self.assertEqual(pronounce_number(91, lang="sv-se"), "nittioen") self.assertEqual(pronounce_number(97, lang="sv-se"), "nittiosju") self.assertEqual(pronounce_number(300, lang="sv-se"), "trehundra") self.assertEqual(pronounce_number(10000001, lang="sv-se"), "tiomiljoner en") def test_convert_negative_int_sv(self): self.assertEqual(pronounce_number(-1, lang="sv-se"), "minus en") self.assertEqual(pronounce_number(-10, lang="sv-se"), "minus tio") self.assertEqual(pronounce_number(-15, lang="sv-se"), "minus femton") self.assertEqual(pronounce_number(-20, lang="sv-se"), "minus tjugo") self.assertEqual(pronounce_number(-27, lang="sv-se"), "minus tjugosju") self.assertEqual(pronounce_number(-30, lang="sv-se"), "minus trettio") self.assertEqual(pronounce_number(-33, lang="sv-se"), "minus trettiotre") def test_convert_dacimals_sv(self): self.assertEqual(pronounce_number(1.1, lang="sv-se", places=1), "en komma en") self.assertEqual(pronounce_number(1.234, lang="sv-se"), "en komma två tre") self.assertEqual(pronounce_number(21.234, lang="sv-se"), "tjugoen komma två tre") self.assertEqual(pronounce_number(21.234, lang="sv-se", places=1), "tjugoen komma två") self.assertEqual(pronounce_number(21.234, lang="sv-se", places=0), "tjugoen") 
self.assertEqual(pronounce_number(21.234, lang="sv-se", places=3), "tjugoen komma två tre fyra") self.assertEqual(pronounce_number(21.234, lang="sv-se", places=4), "tjugoen komma två tre fyra noll") self.assertEqual(pronounce_number(21.234, lang="sv-se", places=5), "tjugoen komma två tre fyra noll noll") self.assertEqual(pronounce_number(-1.234, lang="sv-se"), "minus en komma två tre") self.assertEqual(pronounce_number(-21.234, lang="sv-se"), "minus tjugoen komma två tre") self.assertEqual(pronounce_number(-21.234, lang="sv-se", places=1), "minus tjugoen komma två") self.assertEqual(pronounce_number(-21.234, lang="sv-se", places=0), "minus tjugoen") self.assertEqual(pronounce_number(-21.234, lang="sv-se", places=3), "minus tjugoen komma två tre fyra") self.assertEqual(pronounce_number(-21.234, lang="sv-se", places=4), "minus tjugoen komma två tre fyra noll") self.assertEqual(pronounce_number(-21.234, lang="sv-se", places=5), "minus tjugoen komma två tre fyra noll noll") # def nice_time(dt, lang="sv-se", speech=True, use_24hour=False, # use_ampm=False): class TestNiceDateFormat_sv(unittest.TestCase): def test_convert_times_sv(self): dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "tjugotvå minuter över ett") self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "tjugotvå minuter över ett på eftermiddagen") self.assertEqual(nice_time(dt, lang="sv-se", speech=False), "01:22") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_ampm=True), "01:22 PM") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=True), "tretton tjugotvå") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=False), "tretton tjugotvå") dt = datetime.datetime(2017, 1, 31, 13, 0, 3, 
tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "ett") self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "ett på eftermiddagen") self.assertEqual(nice_time(dt, lang="sv-se", speech=False), "01:00") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_ampm=True), "01:00 PM") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True, use_ampm=True), "13:00") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=True), "tretton") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=False), "tretton") dt = datetime.datetime(2017, 1, 31, 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "två minuter över ett") self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "två minuter över ett på eftermiddagen") self.assertEqual(nice_time(dt, lang="sv-se", speech=False), "01:02") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_ampm=True), "01:02 PM") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=True), "tretton noll två") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=False), "tretton noll två") dt = datetime.datetime(2017, 1, 31, 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "två minuter över tolv") self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "två minuter över tolv på natten") self.assertEqual(nice_time(dt, lang="sv-se", speech=False), "12:02") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_ampm=True), "12:02 AM") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, lang="sv-se", 
speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=True), "noll noll två") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=False), "noll noll två") dt = datetime.datetime(2017, 1, 31, 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "kvart över tolv") self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "kvart över tolv på eftermiddagen") self.assertEqual(nice_time(dt, lang="sv-se", speech=False), "12:15") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_ampm=True), "12:15 PM") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True), "12:15") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True, use_ampm=True), "12:15") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=True), "tolv femton") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=False), "tolv femton") dt = datetime.datetime(2017, 1, 31, 19, 40, 49, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "tjugo minuter i åtta") self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "tjugo minuter i åtta på kvällen") self.assertEqual(nice_time(dt, lang="sv-se", speech=False), "07:40") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_ampm=True), "07:40 PM") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True), "19:40") self.assertEqual(nice_time(dt, lang="sv-se", speech=False, use_24hour=True, use_ampm=True), "19:40") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=True), "nitton fyrtio") self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True, use_ampm=False), "nitton fyrtio") dt = datetime.datetime(2017, 1, 31, 1, 15, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se", use_24hour=True), "ett femton") dt = datetime.datetime(2017, 1, 31, 1, 35, 00, 
tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "tjugofem minuter i två") dt = datetime.datetime(2017, 1, 31, 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "kvart i två") dt = datetime.datetime(2017, 1, 31, 4, 50, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "tio i fem") dt = datetime.datetime(2017, 1, 31, 5, 55, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se"), "fem i sex") dt = datetime.datetime(2017, 1, 31, 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, lang="sv-se", use_ampm=True), "halv sex på morgonen") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_localizer.py000066400000000000000000000216631426211343400224150ustar00rootroot00000000000000import unittest from sys import version import lingua_franca import lingua_franca.parse import lingua_franca.format from lingua_franca.internal import localized_function, _SUPPORTED_LANGUAGES def unload_all_languages(): """ These tests call this function a LOT. That's as opposed to forcing your test util to run them in order. Sadly, spamming this function is easier and probably less onerous for most devs. 
""" lingua_franca._set_active_langs([]) def setUpModule(): unload_all_languages() def tearDownModule(): unload_all_languages() class TestException(unittest.TestCase): def setUpClass(): unload_all_languages() def tearDownClass(): unload_all_languages() def test_must_load_language(self): unload_all_languages() self.assertRaises(ModuleNotFoundError, lingua_franca.parse.extract_number, 'one') def test_run_own_code_on(self): lingua_franca.load_language('en') # nice_number() has a run_own_code_on for unrecognized languages, # because backwards compatibility requires it to fall back on # str(input_value) rather than failing loudly # # 'cz' is not a supported language, so the function will raise # an UnsupportedLanguageError, but nice_number() is decorated with # @localized_function(run_own_code_on=[UnsupportedLanguageError]) self.assertEqual(lingua_franca.format.nice_number(123, lang='cz'), "123") self.assertEqual(lingua_franca.format.nice_number(123.45, speech=False, lang='cz'), "123.45") # It won't intercept other exceptions, though! with self.assertRaises(ModuleNotFoundError): unload_all_languages() lingua_franca.format.nice_number(123.45) # ModuleNotFoundError: No language module loaded. 
with self.assertRaises(ValueError): @localized_function("not an error type") def foo_must_fail(): pass with self.assertRaises(ValueError): @localized_function(print) def bar_must_fail_too(): pass with self.assertRaises(ValueError): @localized_function([1, 2, 3]) def baz_must_fail_as_well(): pass def test_type_error(self): with self.assertRaises(TypeError): lingua_franca.load_language(12) class TestDeprecation(unittest.TestCase): def test_deprecate_explicit_null_lang(self): unload_all_languages() lingua_franca.set_default_lang('en') with self.assertWarns(DeprecationWarning): self.assertEqual( lingua_franca.parse.extract_number("one", lang=None), 1 ) unload_all_languages() def test_deprecate_positional_null_lang(self): unload_all_languages() lingua_franca.set_default_lang('en') with self.assertWarns(DeprecationWarning): self.assertEqual( lingua_franca.parse.extract_number("one", True, False, None), 1 ) unload_all_languages() class TestLanguageLoading(unittest.TestCase): def test_load_on_demand(self): unload_all_languages() lingua_franca.load_language("en") lingua_franca.config.load_langs_on_demand = True self.assertEqual(lingua_franca.parse.extract_number("one", lang="en"), 1) self.assertEqual(lingua_franca.parse.extract_number("uno", lang="es"), 1) lingua_franca.config.load_langs_on_demand = False # English should still be loaded, but not Spanish self.assertEqual(lingua_franca.parse.extract_number("one", lang="en"), 1) with self.assertRaises(ModuleNotFoundError): lingua_franca.parse.extract_number("uno", lang="es") unload_all_languages() def test_load_language(self): lingua_franca.load_language('en') # Verify that English is loaded and, since it's the only language # we've loaded, also the default. 
self.assertEqual(lingua_franca.get_default_lang(), 'en') # Verify that English's default full code is 'en-us' self.assertEqual(lingua_franca.get_full_lang_code('en'), 'en-us') # Verify that this is also our current full code self.assertEqual(lingua_franca.get_default_loc(), 'en-us') self.assertFalse('es' in lingua_franca.get_active_langs()) # Verify that unloaded languages can't be invoked explicitly self.assertRaises(ModuleNotFoundError, lingua_franca.parse.extract_number, 'uno', lang='es') unload_all_languages() def test_auto_default_language(self): lingua_franca.load_language('en') # Load two languages, ensure first is default lingua_franca.load_languages(['en', 'es']) self.assertEqual(lingua_franca.get_default_lang(), 'en') self.assertEqual(lingua_franca.parse.extract_number('one'), 1) unload_all_languages() def test_set_default_language(self): lingua_franca.load_languages(['es', 'en']) lingua_franca.set_default_lang('en') self.assertEqual(lingua_franca.get_default_lang(), 'en') unload_all_languages() with self.assertRaises(ValueError): lingua_franca.set_default_lang('foobar') def test_default_language_singles(self): # Load languages one at a time, ensure first is default self.assertEqual(lingua_franca.get_active_langs(), []) lingua_franca.load_language('en') self.assertEqual(lingua_franca.get_default_lang(), 'en') lingua_franca.load_language('es') self.assertEqual(lingua_franca.get_default_lang(), 'en') self.assertEqual(lingua_franca.parse.extract_number('dos'), False) self.assertEqual(lingua_franca.parse.extract_number('dos', lang='es'), 2) # Verify default language failover lingua_franca.unload_language('en') self.assertEqual(lingua_franca.get_default_lang(), 'es') unload_all_languages() def test_set_active_langs(self): unload_all_languages() lingua_franca.load_languages(['en', 'es']) self.assertEqual(lingua_franca.get_active_langs(), ['en', 'es']) lingua_franca._set_active_langs('es') self.assertEqual(lingua_franca.get_default_lang(), 'es') 
self.assertFalse('en' in lingua_franca.get_active_langs()) unload_all_languages() with self.assertRaises(TypeError): lingua_franca._set_active_langs(157.75) class TestLocalizerEdgeCases(unittest.TestCase): def test_pass_lang_code_positionally(self): lingua_franca.load_languages(['en', 'es']) self.assertEqual( lingua_franca.parse.extract_number("dos", True, False, 'es'), 2) unload_all_languages() def test_function_not_localized_error(self): lingua_franca.load_language('en') with self.assertRaises( lingua_franca.internal.FunctionNotLocalizedError): lingua_franca.parse.is_ordinal("twelve") unload_all_languages() class TestGetter(unittest.TestCase): def test_primary_lang_code(self): unload_all_languages() lingua_franca.load_language('en') # should default to the default lang with no input self.assertEqual(lingua_franca.get_primary_lang_code(), 'en') with self.assertRaises(TypeError): lingua_franca.get_primary_lang_code(12) unload_all_languages() def test_full_lang_code(self): unload_all_languages() self.assertEqual(lingua_franca.get_default_lang(), None) # Return default full lang code if no primary code is passed lingua_franca.load_language('en') self.assertEqual(lingua_franca.get_full_lang_code(), 'en-us') lingua_franca.load_language('es') lingua_franca.set_default_lang('es') self.assertEqual(lingua_franca.get_full_lang_code(), 'es-es') # Go look up the default full code for a provided primary code self.assertEqual(lingua_franca.get_full_lang_code('de'), 'de-de') # Fail on wrong type, or language not recognized with self.assertRaises(TypeError): lingua_franca.get_full_lang_code(12) # TODO remove this test and replace with the one below as soon as practical self.assertWarns(DeprecationWarning, lingua_franca.get_full_lang_code, "bob robertson") # TODO this is the version of the test we should use once invalid lang # params are deprecated: # # with self.assertRaises( # lingua_franca.internal.UnsupportedLanguageError): # lingua_franca.get_full_lang_code("bob robertson") 
class TestFuzzyMatch(unittest.TestCase):
    """Tests for the fuzzy string helpers ``fuzzy_match`` and ``match_one``."""

    def test_matches(self):
        """Check fuzzy_match scores for equal, partial and unrelated text."""
        # Dedicated comparison assertions report both operands on failure;
        # assertTrue(a < b) would only report "False is not true".
        self.assertGreaterEqual(fuzzy_match("you and me", "you and me"), 1.0)
        self.assertLess(fuzzy_match("you and me", "you"), 0.5)
        self.assertGreater(fuzzy_match("You", "you"), 0.5)
        # The score must not depend on the order of the arguments.
        self.assertEqual(fuzzy_match("you and me", "you"),
                         fuzzy_match("you", "you and me"))
        self.assertLess(fuzzy_match("you and me", "he or they"), 0.2)

    def test_match_one(self):
        """Check that match_one returns the closest choice as element 0."""
        # test list of choices: the best matching entry itself is returned
        choices = ['frank', 'kate', 'harry', 'henry']
        self.assertEqual(match_one('frank', choices)[0], 'frank')
        self.assertEqual(match_one('fran', choices)[0], 'frank')
        self.assertEqual(match_one('enry', choices)[0], 'henry')
        self.assertEqual(match_one('katt', choices)[0], 'kate')
        # test dictionary of choices: the value of the best key is returned
        choices = {'frank': 1, 'kate': 2, 'harry': 3, 'henry': 4}
        self.assertEqual(match_one('frank', choices)[0], 1)
        self.assertEqual(match_one('enry', choices)[0], 4)
def test_extract_number_ambiguous(self):
    """Pin extract_number results for ordinal/fraction/time-unit overlaps."""

    def check(text, expected, **options):
        # Forward only the keyword arguments given for this case, so calls
        # without ``ordinals``/``short_scale`` use the library defaults.
        self.assertEqual(extract_number(text, **options), expected)

    # test explicit ordinals
    check("this is the 1st", 1, ordinals=True)
    check("this is the 2nd", 2, ordinals=False)
    check("this is the 3rd", 3, ordinals=None)
    check("this is the 4th", 4, ordinals=None)
    check("this is the 7th test", 7, ordinals=True)
    check("this is the 7th test", 7, ordinals=False)
    self.assertTrue(extract_number("this is the nth test") is False)
    check("this is the 1st test", 1)
    check("this is the 2nd test", 2)
    check("this is the 3rd test", 3)
    check("this is the 31st test", 31)
    check("this is the 32nd test", 32)
    check("this is the 33rd test", 33)
    check("this is the 34th test", 34)

    # test non ambiguous ordinals
    check("this is the first test", 1, ordinals=True)
    check("this is the first test", False, ordinals=False)
    check("this is the first test", False, ordinals=None)

    # test ambiguous ordinal/time unit
    check("this is second test", 2, ordinals=True)
    check("this is second test", False, ordinals=False)
    check("remind me in a second", 2, ordinals=True)
    check("remind me in a second", False, ordinals=False)
    check("remind me in a second", False, ordinals=None)

    # test ambiguous ordinal/fractional
    check("this is the third test", 3.0, ordinals=True)
    check("this is the third test", 1.0 / 3.0, ordinals=False)
    check("this is the third test", False, ordinals=None)

    check("one third of a cup", 1.0 / 3.0, ordinals=False)
    check("one third of a cup", 3, ordinals=True)
    check("one third of a cup", 1, ordinals=None)

    # test plurals
    # NOTE plurals are never considered ordinals, but also not
    # considered explicit fractions
    check("2 fifths", 2, ordinals=True)
    check("2 fifth", 5, ordinals=True)
    check("2 fifths", 2/5, ordinals=False)
    check("2 fifths", 2, ordinals=None)

    check("Twenty two and Three Fifths", 22.6)

    # test multiple ambiguous
    check("sixth third", False, ordinals=None)
    check("thirty second", 30, ordinals=False)
    check("thirty second", 30, ordinals=None)
    check("thirty second", 32, ordinals=True)
    # TODO this test is imperfect, further discussion needed
    # "Sixth third" would probably refer to "the sixth instance of a third"
    # It is unclear what should be returned here; it probably should not
    # be cumulative.
    check("sixth third", 1 / 6 / 3, ordinals=False)

    # test big numbers / short vs long scale
    billionth = "this is the billionth test"
    check(billionth, 1e09, ordinals=True)
    check(billionth, False, ordinals=None)
    check(billionth, 1e-9, ordinals=False)
    check(billionth, 1e12, ordinals=True, short_scale=False)
    check(billionth, False, ordinals=None, short_scale=False)
    check(billionth, 1e-12, short_scale=False)

    # test the Nth one
    check("the fourth one", 4.0, ordinals=True)
    check("the thirty sixth one", 36.0, ordinals=True)
    check("you are the second one", 1, ordinals=False)
    check("you are the second one", 2, ordinals=True)
    check("you are the 1st one", 1, ordinals=None)
    check("you are the 2nd one", 2, ordinals=None)
    check("you are the 3rd one", 3, ordinals=None)
    check("you are the 8th one", 8, ordinals=None)
self.assertEqual(extract_number("one and a half cups"), 1.5) self.assertEqual(extract_number("one and one half cups"), 1.5) self.assertEqual(extract_number("three quarter cups"), 3.0 / 4.0) self.assertEqual(extract_number("three quarters cups"), 3.0 / 4.0) self.assertEqual(extract_number("twenty two"), 22) self.assertEqual(extract_number( "Twenty two with a leading capital letter"), 22) self.assertEqual(extract_number( "twenty Two with Two capital letters"), 22) self.assertEqual(extract_number( "twenty Two with mixed capital letters"), 22) self.assertEqual(extract_number("two hundred"), 200) self.assertEqual(extract_number("nine thousand"), 9000) self.assertEqual(extract_number("six hundred sixty six"), 666) self.assertEqual(extract_number("two million"), 2000000) self.assertEqual(extract_number("two million five hundred thousand " "tons of spinning metal"), 2500000) self.assertEqual(extract_number("six trillion"), 6000000000000.0) self.assertEqual(extract_number("six trillion", short_scale=False), 6e+18) self.assertEqual(extract_number("one point five"), 1.5) self.assertEqual(extract_number("three dot fourteen"), 3.14) self.assertEqual(extract_number("zero point two"), 0.2) self.assertEqual(extract_number("billions of years older"), 1000000000.0) self.assertEqual(extract_number("billions of years older", short_scale=False), 1000000000000.0) self.assertEqual(extract_number("one hundred thousand"), 100000) self.assertEqual(extract_number("minus 2"), -2) self.assertEqual(extract_number("negative seventy"), -70) self.assertEqual(extract_number("thousand million"), 1000000000) # Verify non-power multiples of ten no longer discard # adjacent multipliers self.assertEqual(extract_number("twenty thousand"), 20000) self.assertEqual(extract_number("fifty million"), 50000000) # Verify smaller powers of ten no longer cause miscalculation of larger # powers of ten (see MycroftAI#86) self.assertEqual(extract_number("twenty billion three hundred million \ nine hundred fifty 
def test_extract_duration_en(self):
    """Check extract_duration returns (timedelta, leftover text) pairs."""
    # Each entry is (utterance, expected duration, expected leftover text).
    cases = [
        ("10 seconds", timedelta(seconds=10.0), ""),
        ("5 minutes", timedelta(minutes=5), ""),
        ("2 hours", timedelta(hours=2), ""),
        ("3 days", timedelta(days=3), ""),
        ("25 weeks", timedelta(weeks=25), ""),
        ("seven hours", timedelta(hours=7), ""),
        ("7.5 seconds", timedelta(seconds=7.5), ""),
        ("eight and a half days thirty nine seconds",
         timedelta(days=8.5, seconds=39), ""),
        # Several durations in one sentence are summed; the surrounding
        # words and punctuation are what remains.
        ("wake me up in three weeks, four hundred ninety seven days, and"
         " three hundred 91.6 seconds",
         timedelta(weeks=3, days=497, seconds=391.6),
         "wake me up in , , and"),
        # A hyphen between the number and the unit is accepted too.
        ("10-seconds", timedelta(seconds=10.0), ""),
        ("5-minutes", timedelta(minutes=5), ""),
    ]
    for utterance, duration, leftover in cases:
        self.assertEqual(extract_duration(utterance), (duration, leftover))
expected_date, expected_leftover): res = extractWithFormat(normalize(text)) self.assertEqual(res[0], expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) testExtract("now is the time", "2017-06-27 13:04:00", "is time") testExtract("in a second", "2017-06-27 13:04:01", "") testExtract("in a minute", "2017-06-27 13:05:00", "") testExtract("in a couple minutes", "2017-06-27 13:06:00", "") testExtract("in a couple of minutes", "2017-06-27 13:06:00", "") testExtract("in a couple hours", "2017-06-27 15:04:00", "") testExtract("in a couple of hours", "2017-06-27 15:04:00", "") testExtract("in a couple weeks", "2017-07-11 00:00:00", "") testExtract("in a couple of weeks", "2017-07-11 00:00:00", "") testExtract("in a couple months", "2017-08-27 00:00:00", "") testExtract("in a couple years", "2019-06-27 00:00:00", "") testExtract("in a couple of months", "2017-08-27 00:00:00", "") testExtract("in a couple of years", "2019-06-27 00:00:00", "") testExtract("in a decade", "2027-06-27 00:00:00", "") testExtract("in a couple of decades", "2037-06-27 00:00:00", "") testExtract("next decade", "2027-06-27 00:00:00", "") testExtract("in a century", "2117-06-27 00:00:00", "") testExtract("in a millennium", "3017-06-27 00:00:00", "") testExtract("in a couple decades", "2037-06-27 00:00:00", "") testExtract("in 5 decades", "2067-06-27 00:00:00", "") testExtract("in a couple centuries", "2217-06-27 00:00:00", "") testExtract("in a couple of centuries", "2217-06-27 00:00:00", "") testExtract("in 2 centuries", "2217-06-27 00:00:00", "") testExtract("in a couple millenniums", "4017-06-27 00:00:00", "") testExtract("in a couple of millenniums", "4017-06-27 00:00:00", "") testExtract("in an hour", "2017-06-27 14:04:00", "") testExtract("i want it within the hour", "2017-06-27 14:04:00", "i want it") testExtract("in 1 second", "2017-06-27 13:04:01", "") testExtract("in 2 seconds", "2017-06-27 13:04:02", "") testExtract("Set the ambush in 1 minute", 
"2017-06-27 13:05:00", "set ambush") testExtract("Set the ambush for 5 days from today", "2017-07-02 00:00:00", "set ambush") testExtract("day after tomorrow", "2017-06-29 00:00:00", "") testExtract("What is the day after tomorrow's weather?", "2017-06-29 00:00:00", "what is weather") testExtract("Remind me at 10:45 pm", "2017-06-27 22:45:00", "remind me") testExtract("what is the weather on friday morning", "2017-06-30 08:00:00", "what is weather") testExtract("what is tomorrow's weather", "2017-06-28 00:00:00", "what is weather") testExtract("what is this afternoon's weather", "2017-06-27 15:00:00", "what is weather") testExtract("what is this evening's weather", "2017-06-27 19:00:00", "what is weather") testExtract("what was this morning's weather", "2017-06-27 08:00:00", "what was weather") testExtract("remind me to call mom in 8 weeks and 2 days", "2017-08-24 00:00:00", "remind me to call mom") testExtract("remind me to call mom on august 3rd", "2017-08-03 00:00:00", "remind me to call mom") testExtract("remind me tomorrow to call mom at 7am", "2017-06-28 07:00:00", "remind me to call mom") testExtract("remind me tomorrow to call mom at 10pm", "2017-06-28 22:00:00", "remind me to call mom") testExtract("remind me to call mom at 7am", "2017-06-28 07:00:00", "remind me to call mom") testExtract("remind me to call mom in an hour", "2017-06-27 14:04:00", "remind me to call mom") testExtract("remind me to call mom at 1730", "2017-06-27 17:30:00", "remind me to call mom") testExtract("remind me to call mom at 0630", "2017-06-28 06:30:00", "remind me to call mom") testExtract("remind me to call mom at 06 30 hours", "2017-06-28 06:30:00", "remind me to call mom") testExtract("remind me to call mom at 06 30", "2017-06-28 06:30:00", "remind me to call mom") testExtract("remind me to call mom at 06 30 hours", "2017-06-28 06:30:00", "remind me to call mom") testExtract("remind me to call mom at 7 o'clock", "2017-06-27 19:00:00", "remind me to call mom") 
testExtract("remind me to call mom this evening at 7 o'clock", "2017-06-27 19:00:00", "remind me to call mom") testExtract("remind me to call mom at 7 o'clock tonight", "2017-06-27 19:00:00", "remind me to call mom") testExtract("remind me to call mom at 7 o'clock in the morning", "2017-06-28 07:00:00", "remind me to call mom") testExtract("remind me to call mom Thursday evening at 7 o'clock", "2017-06-29 19:00:00", "remind me to call mom") testExtract("remind me to call mom Thursday morning at 7 o'clock", "2017-06-29 07:00:00", "remind me to call mom") testExtract("remind me to call mom at 7 o'clock Thursday morning", "2017-06-29 07:00:00", "remind me to call mom") testExtract("remind me to call mom at 7:00 Thursday morning", "2017-06-29 07:00:00", "remind me to call mom") # TODO: This test is imperfect due to the "at 7:00" still in the # remainder. But let it pass for now since time is correct testExtract("remind me to call mom at 7:00 Thursday evening", "2017-06-29 19:00:00", "remind me to call mom at 7:00") testExtract("remind me to call mom at 8 Wednesday evening", "2017-06-28 20:00:00", "remind me to call mom") testExtract("remind me to call mom at 8 Wednesday in the evening", "2017-06-28 20:00:00", "remind me to call mom") testExtract("remind me to call mom Wednesday evening at 8", "2017-06-28 20:00:00", "remind me to call mom") testExtract("remind me to call mom in two hours", "2017-06-27 15:04:00", "remind me to call mom") testExtract("remind me to call mom in 2 hours", "2017-06-27 15:04:00", "remind me to call mom") testExtract("remind me to call mom in 15 minutes", "2017-06-27 13:19:00", "remind me to call mom") testExtract("remind me to call mom in fifteen minutes", "2017-06-27 13:19:00", "remind me to call mom") testExtract("remind me to call mom at 10am 2 days after this saturday", "2017-07-03 10:00:00", "remind me to call mom") testExtract("Play Rick Astley music 2 days from Friday", "2017-07-02 00:00:00", "play rick astley music") testExtract("Begin 
the invasion at 3:45 pm on Thursday", "2017-06-29 15:45:00", "begin invasion") testExtract("On Monday, order pie from the bakery", "2017-07-03 00:00:00", "order pie from bakery") testExtract("Play Happy Birthday music 5 years from today", "2022-06-27 00:00:00", "play happy birthday music") testExtract("Skype Mom at 12:45 pm next Thursday", "2017-07-06 12:45:00", "skype mom") testExtract("What's the weather next Friday?", "2017-06-30 00:00:00", "what weather") testExtract("What's the weather next Wednesday?", "2017-07-05 00:00:00", "what weather") testExtract("What's the weather next Thursday?", "2017-07-06 00:00:00", "what weather") testExtract("what is the weather next friday morning", "2017-06-30 08:00:00", "what is weather") testExtract("what is the weather next friday evening", "2017-06-30 19:00:00", "what is weather") testExtract("what is the weather next friday afternoon", "2017-06-30 15:00:00", "what is weather") testExtract("remind me to call mom on august 3rd", "2017-08-03 00:00:00", "remind me to call mom") testExtract("Buy fireworks on the 4th of July", "2017-07-04 00:00:00", "buy fireworks") testExtract("what is the weather 2 weeks from next friday", "2017-07-14 00:00:00", "what is weather") testExtract("what is the weather wednesday at 0700 hours", "2017-06-28 07:00:00", "what is weather") testExtract("set an alarm wednesday at 7 o'clock", "2017-06-28 07:00:00", "set alarm") testExtract("Set up an appointment at 12:45 pm next Thursday", "2017-07-06 12:45:00", "set up appointment") testExtract("What's the weather this Thursday?", "2017-06-29 00:00:00", "what weather") testExtract("set up the visit for 2 weeks and 6 days from Saturday", "2017-07-21 00:00:00", "set up visit") testExtract("Begin the invasion at 03 45 on Thursday", "2017-06-29 03:45:00", "begin invasion") testExtract("Begin the invasion at o 800 hours on Thursday", "2017-06-29 08:00:00", "begin invasion") testExtract("Begin the party at 8 o'clock in the evening on Thursday", "2017-06-29 
20:00:00", "begin party") testExtract("Begin the invasion at 8 in the evening on Thursday", "2017-06-29 20:00:00", "begin invasion") testExtract("Begin the invasion on Thursday at noon", "2017-06-29 12:00:00", "begin invasion") testExtract("Begin the invasion on Thursday at midnight", "2017-06-29 00:00:00", "begin invasion") testExtract("Begin the invasion on Thursday at 0500", "2017-06-29 05:00:00", "begin invasion") testExtract("remind me to wake up in 4 years", "2021-06-27 00:00:00", "remind me to wake up") testExtract("remind me to wake up in 4 years and 4 days", "2021-07-01 00:00:00", "remind me to wake up") testExtract("What is the weather 3 days after tomorrow?", "2017-07-01 00:00:00", "what is weather") testExtract("december 3", "2017-12-03 00:00:00", "") testExtract("lets meet at 8:00 tonight", "2017-06-27 20:00:00", "lets meet") testExtract("lets meet at 5pm", "2017-06-27 17:00:00", "lets meet") testExtract("lets meet at 8 a.m.", "2017-06-28 08:00:00", "lets meet") testExtract("remind me to wake up at 8 a.m", "2017-06-28 08:00:00", "remind me to wake up") testExtract("what is the weather on tuesday", "2017-06-27 00:00:00", "what is weather") testExtract("what is the weather on monday", "2017-07-03 00:00:00", "what is weather") testExtract("what is the weather this wednesday", "2017-06-28 00:00:00", "what is weather") testExtract("on thursday what is the weather", "2017-06-29 00:00:00", "what is weather") testExtract("on this thursday what is the weather", "2017-06-29 00:00:00", "what is weather") testExtract("on last monday what was the weather", "2017-06-26 00:00:00", "what was weather") testExtract("set an alarm for wednesday evening at 8", "2017-06-28 20:00:00", "set alarm") testExtract("set an alarm for wednesday at 3 o'clock in the afternoon", "2017-06-28 15:00:00", "set alarm") testExtract("set an alarm for wednesday at 3 o'clock in the morning", "2017-06-28 03:00:00", "set alarm") testExtract("set an alarm for wednesday morning at 7 o'clock", 
"2017-06-28 07:00:00", "set alarm") testExtract("set an alarm for today at 7 o'clock", "2017-06-27 19:00:00", "set alarm") testExtract("set an alarm for this evening at 7 o'clock", "2017-06-27 19:00:00", "set alarm") # TODO: This test is imperfect due to the "at 7:00" still in the # remainder. But let it pass for now since time is correct testExtract("set an alarm for this evening at 7:00", "2017-06-27 19:00:00", "set alarm at 7:00") testExtract("on the evening of june 5th 2017 remind me to" + " call my mother", "2017-06-05 19:00:00", "remind me to call my mother") # TODO: This test is imperfect due to the missing "for" in the # remainder. But let it pass for now since time is correct testExtract("update my calendar for a morning meeting with julius" + " on march 4th", "2018-03-04 08:00:00", "update my calendar meeting with julius") testExtract("remind me to call mom next tuesday", "2017-07-04 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 3 weeks", "2017-07-18 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 8 weeks", "2017-08-22 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 8 weeks and 2 days", "2017-08-24 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 4 days", "2017-07-01 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 3 months", "2017-09-27 00:00:00", "remind me to call mom") testExtract("remind me to call mom in 2 years and 2 days", "2019-06-29 00:00:00", "remind me to call mom") testExtract("remind me to call mom next week", "2017-07-04 00:00:00", "remind me to call mom") testExtract("remind me to call mom at 10am on saturday", "2017-07-01 10:00:00", "remind me to call mom") testExtract("remind me to call mom at 10am this saturday", "2017-07-01 10:00:00", "remind me to call mom") testExtract("remind me to call mom at 10 next saturday", "2017-07-01 10:00:00", "remind me to call mom") testExtract("remind me to call mom at 10am 
next saturday", "2017-07-01 10:00:00", "remind me to call mom") # test yesterday testExtract("what day was yesterday", "2017-06-26 00:00:00", "what day was") testExtract("what day was the day before yesterday", "2017-06-25 00:00:00", "what day was") testExtract("i had dinner yesterday at 6", "2017-06-26 06:00:00", "i had dinner") testExtract("i had dinner yesterday at 6 am", "2017-06-26 06:00:00", "i had dinner") testExtract("i had dinner yesterday at 6 pm", "2017-06-26 18:00:00", "i had dinner") # Below two tests, ensure that time is picked # even if no am/pm is specified # in case of weekdays/tonight testExtract("set alarm for 9 on weekdays", "2017-06-27 21:00:00", "set alarm weekdays") testExtract("for 8 tonight", "2017-06-27 20:00:00", "") testExtract("for 8:30pm tonight", "2017-06-27 20:30:00", "") # Tests a time with ':' & without am/pm testExtract("set an alarm for tonight 9:30", "2017-06-27 21:30:00", "set alarm") testExtract("set an alarm at 9:00 for tonight", "2017-06-27 21:00:00", "set alarm") # Check if it picks the intent irrespective of correctness testExtract("set an alarm at 9 o'clock for tonight", "2017-06-27 21:00:00", "set alarm") testExtract("remind me about the game tonight at 11:30", "2017-06-27 23:30:00", "remind me about game") testExtract("set alarm at 7:30 on weekdays", "2017-06-27 19:30:00", "set alarm on weekdays") # "# days " testExtract("my birthday is 2 days from today", "2017-06-29 00:00:00", "my birthday is") testExtract("my birthday is 2 days after today", "2017-06-29 00:00:00", "my birthday is") testExtract("my birthday is 2 days from tomorrow", "2017-06-30 00:00:00", "my birthday is") testExtract("my birthday is 2 days after tomorrow", "2017-06-30 00:00:00", "my birthday is") testExtract("remind me to call mom at 10am 2 days after next saturday", "2017-07-10 10:00:00", "remind me to call mom") testExtract("my birthday is 2 days from yesterday", "2017-06-28 00:00:00", "my birthday is") testExtract("my birthday is 2 days after 
def test_extract_date_years(self):
    """Check 'in <year>' moves the anchor date to that year."""
    # Anchor: Tue June 27, 2017
    anchor = datetime(2017, 6, 27, tzinfo=default_timezone())
    # Same month, day and tzinfo as the anchor, only the year differs.
    expected = anchor.replace(year=2007)
    extracted = extract_datetime('in 2007', anchor)[0]
    self.assertEqual(extracted, expected)
def test_extract_with_other_tzinfo(self):
    """Check extract_datetime keeps the tzinfo of a non-default anchor."""

    def fields(moment):
        # Compare the individual fields plus tzinfo; this is the exact
        # set of attributes the test asserts on.
        return (moment.year, moment.month, moment.day,
                moment.hour, moment.minute, moment.second,
                moment.tzinfo)

    local_tz = default_timezone()
    local_dt = datetime(2019, 7, 4, 7, 1, 2, tzinfo=local_tz)

    # Build a fixed-offset zone one hour away from the default timezone.
    shifted_offset = local_tz.utcoffset(local_dt) + timedelta(hours=1)
    not_local_tz = tz.tzoffset('TST', shifted_offset.total_seconds())
    not_local_dt = datetime(2019, 7, 4, 8, 1, 2, tzinfo=not_local_tz)

    test_dt, _ = extract_datetime("now is the time", not_local_dt)

    self.assertEqual(fields(test_dt), fields(not_local_dt))
    self.assertNotEqual(fields(test_dt), fields(local_dt))
def test_normalize_numbers(self):
    """Check how normalize rewrites number words, ordinals and fractions."""
    # Each entry is (input text, expected normalized text).
    cases = [
        ("remind me to do something at two to two",
         "remind me to do something at 2 to 2"),
        ('what time will it be in two minutes',
         'what time will it be in 2 minutes'),
        ('What time will it be in twenty two minutes',
         'What time will it be in 22 minutes'),
        ("remind me to do something at twenty to two",
         "remind me to do something at 20 to 2"),
        # TODO imperfect test, maybe should return
        #  'my favorite numbers are 20 2'.
        #  Let it pass for now since this is likely an STT issue if ever
        #  encountered in the wild and is somewhat ambiguous: if this was
        #  spoken by a human the result is what we expect, in written
        #  form it is ambiguous but could mean separate numbers.
        ('my favorite numbers are twenty 2',
         'my favorite numbers are 22'),
        # TODO imperfect test, same as above; fixing it would impact
        #  extract_numbers quite a bit and require a non-trivial amount
        #  of refactoring.
        ('my favorite numbers are 20 2',
         'my favorite numbers are 22'),
        # test ordinals
        ('this is the first', 'this is first'),
        ('this is the first second', 'this is first second'),
        ('this is the first second and third',
         'this is first second and third'),
        # test fractions
        ('whole hour', 'whole hour'),
        ('quarter hour', 'quarter hour'),
        ('halve hour', 'halve hour'),
        ('half hour', 'half hour'),
    ]
    for text, expected in cases:
        self.assertEqual(normalize(text), expected)
self.assertEqual(normalize("that's fifteen sixteen seventeen"), "that is 15 16 17") self.assertEqual(normalize("that's eighteen nineteen twenty"), "that is 18 19 20") self.assertEqual(normalize("that's one nineteen twenty two"), "that is 1 19 22") self.assertEqual(normalize("that's one hundred"), "that is 100") self.assertEqual(normalize("that's one two twenty two"), "that is 1 2 22") self.assertEqual(normalize("that's one and a half"), "that is 1 and half") self.assertEqual(normalize("that's one and a half and five six"), "that is 1 and half and 5 6") def test_multiple_numbers(self): self.assertEqual(extract_numbers("this is a one two three test"), [1.0, 2.0, 3.0]) self.assertEqual(extract_numbers("it's a four five six test"), [4.0, 5.0, 6.0]) self.assertEqual(extract_numbers("this is a ten eleven twelve test"), [10.0, 11.0, 12.0]) self.assertEqual(extract_numbers("this is a one twenty one test"), [1.0, 21.0]) self.assertEqual(extract_numbers("1 dog, seven pigs, macdonald had a " "farm, 3 times 5 macarena"), [1, 7, 3, 5]) self.assertEqual(extract_numbers("two beers for two bears"), [2.0, 2.0]) self.assertEqual(extract_numbers("twenty 20 twenty"), [20, 20, 20]) self.assertEqual(extract_numbers("twenty 20 22"), [20.0, 20.0, 22.0]) self.assertEqual(extract_numbers("twenty twenty two twenty"), [20, 22, 20]) self.assertEqual(extract_numbers("twenty 2"), [22.0]) self.assertEqual(extract_numbers("twenty 20 twenty 2"), [20, 20, 22]) self.assertEqual(extract_numbers("third one"), [1 / 3, 1]) self.assertEqual(extract_numbers("third one", ordinals=True), [3]) self.assertEqual(extract_numbers("six trillion", short_scale=True), [6e12]) self.assertEqual(extract_numbers("six trillion", short_scale=False), [6e18]) self.assertEqual(extract_numbers("two pigs and six trillion bacteria", short_scale=True), [2, 6e12]) self.assertEqual(extract_numbers("two pigs and six trillion bacteria", short_scale=False), [2, 6e18]) self.assertEqual(extract_numbers("thirty second or first", 
ordinals=True), [32, 1]) self.assertEqual(extract_numbers("this is a seven eight nine and a" " half test"), [7.0, 8.0, 9.5]) def test_contractions(self): self.assertEqual(normalize("ain't"), "is not") self.assertEqual(normalize("aren't"), "are not") self.assertEqual(normalize("can't"), "can not") self.assertEqual(normalize("could've"), "could have") self.assertEqual(normalize("couldn't"), "could not") self.assertEqual(normalize("didn't"), "did not") self.assertEqual(normalize("doesn't"), "does not") self.assertEqual(normalize("don't"), "do not") self.assertEqual(normalize("gonna"), "going to") self.assertEqual(normalize("gotta"), "got to") self.assertEqual(normalize("hadn't"), "had not") self.assertEqual(normalize("hadn't have"), "had not have") self.assertEqual(normalize("hasn't"), "has not") self.assertEqual(normalize("haven't"), "have not") # TODO: Ambiguous with "he had" self.assertEqual(normalize("he'd"), "he would") self.assertEqual(normalize("he'll"), "he will") # TODO: Ambiguous with "he has" self.assertEqual(normalize("he's"), "he is") # TODO: Ambiguous with "how would" self.assertEqual(normalize("how'd"), "how did") self.assertEqual(normalize("how'll"), "how will") # TODO: Ambiguous with "how has" and "how does" self.assertEqual(normalize("how's"), "how is") # TODO: Ambiguous with "I had" self.assertEqual(normalize("I'd"), "I would") self.assertEqual(normalize("I'll"), "I will") self.assertEqual(normalize("I'm"), "I am") self.assertEqual(normalize("I've"), "I have") self.assertEqual(normalize("I haven't"), "I have not") self.assertEqual(normalize("isn't"), "is not") self.assertEqual(normalize("it'd"), "it would") self.assertEqual(normalize("it'll"), "it will") # TODO: Ambiguous with "it has" self.assertEqual(normalize("it's"), "it is") self.assertEqual(normalize("it isn't"), "it is not") self.assertEqual(normalize("mightn't"), "might not") self.assertEqual(normalize("might've"), "might have") self.assertEqual(normalize("mustn't"), "must not") 
self.assertEqual(normalize("mustn't have"), "must not have") self.assertEqual(normalize("must've"), "must have") self.assertEqual(normalize("needn't"), "need not") self.assertEqual(normalize("oughtn't"), "ought not") self.assertEqual(normalize("shan't"), "shall not") # TODO: Ambiguous wiht "she had" self.assertEqual(normalize("she'd"), "she would") self.assertEqual(normalize("she hadn't"), "she had not") self.assertEqual(normalize("she'll"), "she will") self.assertEqual(normalize("she's"), "she is") self.assertEqual(normalize("she isn't"), "she is not") self.assertEqual(normalize("should've"), "should have") self.assertEqual(normalize("shouldn't"), "should not") self.assertEqual(normalize("shouldn't have"), "should not have") self.assertEqual(normalize("somebody's"), "somebody is") # TODO: Ambiguous with "someone had" self.assertEqual(normalize("someone'd"), "someone would") self.assertEqual(normalize("someone hadn't"), "someone had not") self.assertEqual(normalize("someone'll"), "someone will") # TODO: Ambiguous with "someone has" self.assertEqual(normalize("someone's"), "someone is") self.assertEqual(normalize("that'll"), "that will") # TODO: Ambiguous with "that has" self.assertEqual(normalize("that's"), "that is") # TODO: Ambiguous with "that had" self.assertEqual(normalize("that'd"), "that would") # TODO: Ambiguous with "there had" self.assertEqual(normalize("there'd"), "there would") self.assertEqual(normalize("there're"), "there are") # TODO: Ambiguous with "there has" self.assertEqual(normalize("there's"), "there is") # TODO: Ambiguous with "they had" self.assertEqual(normalize("they'd"), "they would") self.assertEqual(normalize("they'll"), "they will") self.assertEqual(normalize("they won't have"), "they will not have") self.assertEqual(normalize("they're"), "they are") self.assertEqual(normalize("they've"), "they have") self.assertEqual(normalize("they haven't"), "they have not") self.assertEqual(normalize("wasn't"), "was not") # TODO: Ambiguous wiht "we 
had" self.assertEqual(normalize("we'd"), "we would") self.assertEqual(normalize("we would've"), "we would have") self.assertEqual(normalize("we wouldn't"), "we would not") self.assertEqual(normalize("we wouldn't have"), "we would not have") self.assertEqual(normalize("we'll"), "we will") self.assertEqual(normalize("we won't have"), "we will not have") self.assertEqual(normalize("we're"), "we are") self.assertEqual(normalize("we've"), "we have") self.assertEqual(normalize("weren't"), "were not") self.assertEqual(normalize("what'd"), "what did") self.assertEqual(normalize("what'll"), "what will") self.assertEqual(normalize("what're"), "what are") # TODO: Ambiguous with "what has" / "what does") self.assertEqual(normalize("whats"), "what is") self.assertEqual(normalize("what's"), "what is") self.assertEqual(normalize("what've"), "what have") # TODO: Ambiguous with "when has" self.assertEqual(normalize("when's"), "when is") self.assertEqual(normalize("where'd"), "where did") # TODO: Ambiguous with "where has" / where does" self.assertEqual(normalize("where's"), "where is") self.assertEqual(normalize("where've"), "where have") # TODO: Ambiguous with "who had" "who did") self.assertEqual(normalize("who'd"), "who would") self.assertEqual(normalize("who'd've"), "who would have") self.assertEqual(normalize("who'll"), "who will") self.assertEqual(normalize("who're"), "who are") # TODO: Ambiguous with "who has" / "who does" self.assertEqual(normalize("who's"), "who is") self.assertEqual(normalize("who've"), "who have") self.assertEqual(normalize("why'd"), "why did") self.assertEqual(normalize("why're"), "why are") # TODO: Ambiguous with "why has" / "why does" self.assertEqual(normalize("why's"), "why is") self.assertEqual(normalize("won't"), "will not") self.assertEqual(normalize("won't've"), "will not have") self.assertEqual(normalize("would've"), "would have") self.assertEqual(normalize("wouldn't"), "would not") self.assertEqual(normalize("wouldn't've"), "would not have") 
self.assertEqual(normalize("ya'll"), "you all") self.assertEqual(normalize("y'all"), "you all") self.assertEqual(normalize("y'ain't"), "you are not") # TODO: Ambiguous with "you had" self.assertEqual(normalize("you'd"), "you would") self.assertEqual(normalize("you'd've"), "you would have") self.assertEqual(normalize("you'll"), "you will") self.assertEqual(normalize("you're"), "you are") self.assertEqual(normalize("you aren't"), "you are not") self.assertEqual(normalize("you've"), "you have") self.assertEqual(normalize("you haven't"), "you have not") def test_combinations(self): self.assertEqual(normalize("I couldn't have guessed there'd be two"), "I could not have guessed there would be 2") self.assertEqual(normalize("I wouldn't have"), "I would not have") self.assertEqual(normalize("I hadn't been there"), "I had not been there") self.assertEqual(normalize("I would've"), "I would have") self.assertEqual(normalize("it hadn't"), "it had not") self.assertEqual(normalize("it hadn't have"), "it had not have") self.assertEqual(normalize("it would've"), "it would have") self.assertEqual(normalize("she wouldn't have"), "she would not have") self.assertEqual(normalize("she would've"), "she would have") self.assertEqual(normalize("someone wouldn't have"), "someone would not have") self.assertEqual(normalize("someone would've"), "someone would have") self.assertEqual(normalize("what's the weather like"), "what is weather like") self.assertEqual(normalize("that's what I told you"), "that is what I told you") self.assertEqual(normalize("whats 8 + 4"), "what is 8 + 4") # TODO not localized; needed in english? def test_gender(self): self.assertRaises((AttributeError, FunctionNotLocalizedError), get_gender, "person", None) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_ca.py000066400000000000000000000377251426211343400222140ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest from datetime import datetime, time from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.parse import get_gender from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_number from lingua_franca.parse import normalize from lingua_franca.time import default_timezone def setUpModule(): load_language('ca-es') set_default_lang('ca') def tearDownModule(): unload_language('ca') class TestNormalize(unittest.TestCase): """ Test cases for Catalan parsing """ def test_articles_ca(self): self.assertEqual(normalize("aquesta és la prova", lang="ca", remove_articles=True), "és prova") self.assertEqual( normalize("això és una frase", lang="ca", remove_articles=True), "això és 1 frase") self.assertEqual( normalize("i una altra prova", lang="ca", remove_articles=True), "1 altra prova") self.assertEqual(normalize("això és un test extra", lang="ca", remove_articles=False), "això és 1 test extra") def test_extractnumber_ca(self): self.assertEqual(extract_number("aquest és el primer intent", lang="ca"), 1) self.assertEqual(extract_number("i aquesta la segona prova", lang="ca"), 2) self.assertEqual(extract_number("això l'intent 2", lang="ca"), 2) self.assertEqual(extract_number("això és un terç de pizza", lang="ca"), 1.0 / 3.0) self.assertEqual(extract_number("axiò és la prova del número quatre", lang="ca"), 4) self.assertEqual(extract_number("un terç de tassa", 
lang="ca"), 1.0 / 3.0) self.assertEqual(extract_number("3 tasses", lang="ca"), 3) self.assertEqual(extract_number("1/3 tassa", lang="ca"), 1.0 / 3.0) self.assertEqual(extract_number("quart d'hora", lang="ca"), 0.25) self.assertEqual(extract_number("1/4 hora", lang="ca"), 0.25) self.assertEqual(extract_number("un quart d'hora", lang="ca"), 0.25) self.assertEqual(extract_number("2/3 pinga", lang="ca"), 2.0 / 3.0) self.assertEqual(extract_number("3/4 pinga", lang="ca"), 3.0 / 4.0) self.assertEqual(extract_number("1 i 3/4 cafè", lang="ca"), 1.75) self.assertEqual(extract_number("1 cafè i mig", lang="ca"), 1.5) self.assertEqual(extract_number("un cafè i un mig", lang="ca"), 1.5) self.assertEqual( extract_number("tres quarts de xocolata", lang="ca"), 3.0 / 4.0) self.assertEqual( extract_number("Tres quarts de xocolate", lang="ca"), 3.0 / 4.0) self.assertEqual(extract_number("tres quart de xocolata", lang="ca"), 3.0 / 4.0) self.assertEqual(extract_number("set coma cinc", lang="ca"), 7.5) self.assertEqual(extract_number("set coma 5", lang="ca"), 7.5) self.assertEqual(extract_number("set i mig", lang="ca"), 7.5) self.assertEqual(extract_number("set amb vuitanta", lang="ca"), 7.80) self.assertEqual(extract_number("set i vuit", lang="ca"), 7.8) self.assertEqual(extract_number("set coma zero vuit", lang="ca"), 7.08) self.assertEqual(extract_number("set coma zero zero vuit", lang="ca"), 7.008) self.assertEqual(extract_number("vint trenta ens", lang="ca"), 20.0 / 30.0) self.assertEqual(extract_number("dos", lang="ca"), 2) self.assertEqual(extract_number("dues", lang="ca"), 2) self.assertEqual(extract_number("tres", lang="ca"), 3) self.assertEqual(extract_number("quatre", lang="ca"), 4) self.assertEqual(extract_number("deu", lang="ca"), 10) self.assertEqual(extract_number("trenta-cinc", lang="ca"), 35) self.assertEqual(extract_number("seixanta-sis", lang="ca"), 66) self.assertEqual(extract_number("vint-i-dues", lang="ca"), 22) self.assertEqual(extract_number("vint-i-dos", 
lang="ca"), 22) self.assertEqual(extract_number("quatre-centes", lang="ca"), 400) self.assertEqual(extract_number("cinc-cents", lang="ca"), 500) self.assertEqual(extract_number("sis coma sis-cents seixanta", lang="ca"), 6.66) self.assertEqual(extract_number("sis-cents seixanta-sis", lang="ca"), 666) self.assertEqual(extract_number("sis-cents punt zero sis", lang="ca"), 600.06) self.assertEqual(extract_number("sis-cents coma zero zero sis", lang="ca"), 600.006) self.assertEqual(extract_number("tres-cents coma zero zero tres", lang="ca"), 300.003) def test_agressive_pruning_ca(self): self.assertEqual(normalize("una paraula", lang="ca"), "1 paraula") self.assertEqual(normalize("un mot", lang="ca"), "1 mot") self.assertEqual(normalize("aquesta paraula u", lang="ca"), "paraula 1") self.assertEqual(normalize("l'home el va pegar", lang="ca"), "l'home va pegar") self.assertEqual(normalize("qui va equivocar-se aquell dia", lang="ca"), "qui va equivocar-se dia") def test_spaces_ca(self): self.assertEqual(normalize(" això és el test", lang="ca"), "això és test") self.assertEqual(normalize(" això és l'intent", lang="ca"), "això és l'intent") self.assertEqual(normalize(" això són les proves ", lang="ca"), "això són proves") self.assertEqual(normalize(" això és un test", lang="ca", remove_articles=False), "això és 1 test") def test_numbers_ca(self): self.assertEqual(normalize("això és el test un dos tres", lang="ca"), "això és test 1 2 3") self.assertEqual(normalize("és una prova set vuit nou huit", lang="ca"), "és 1 prova 7 8 9 8") self.assertEqual( normalize("prova zero deu onze dotze tretze", lang="ca"), "prova 0 10 11 12 13") #TODO: seixanta-sis > 66 #self.assertEqual( # normalize("prova 1000 600 seixanta-sis", lang="ca", # remove_articles=False), # "prova 1000 600 66") #TODO: mil dotze > 1012 #self.assertEqual( # normalize("prova mil dotze", lang="ca", # remove_articles=False), # "prova 1012") #TODO: dues-centes vint-i-quatre > 224 #self.assertEqual( # normalize("prova 
dues-centes vint-i-quatre", lang="ca", # remove_articles=False), # "prova 224") self.assertEqual( normalize("test set i mig", lang="ca", remove_articles=False), "test 7 mig") self.assertEqual( normalize("test dos punt nou", lang="ca"), "test 2 punt 9") self.assertEqual( normalize("test cent i nou", lang="ca", remove_articles=False), "test 100 9") self.assertEqual( normalize("test vint i 1", lang="ca"), "test 20 1") def test_extractdatetime_ca(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 0, 0, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date, lang="ca") extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(text) self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) testExtract("quin dia és avui", "2017-06-27 00:00:00", "dia") testExtract("quin dia som avui", "2017-06-27 00:00:00", "dia") testExtract("quin dia és demà", "2017-06-28 00:00:00", "dia") testExtract("quin dia va ser ahir", "2017-06-26 00:00:00", "dia ser") testExtract("quin dia va ser abans ahir", "2017-06-25 00:00:00", "dia ser") testExtract("quin dia va ser abans d'ahir", "2017-06-25 00:00:00", "dia ser") testExtract("quin dia va ser abans-d'ahir", "2017-06-25 00:00:00", "dia ser") testExtract("quin dia va ser abans d'abans d'ahir", "2017-06-24 00:00:00", "dia ser") testExtract("fer el sopar d'aquí 5 dies", "2017-07-02 00:00:00", "fer sopar aquí") testExtract("fer el sopar en 5 dies", "2017-07-02 00:00:00", "fer sopar") testExtract("quin temps farà demà?", "2017-06-28 00:00:00", "temps farà") testExtract("quin temps farà demà-passat?", "2017-06-29 00:00:00", "temps farà") testExtract("quin temps farà despús-demà?", "2017-06-29 00:00:00", "temps farà") testExtract("quin temps farà despús demà?", "2017-06-29 00:00:00", "temps farà") testExtract("truca a la mare les 10:45 pm", "2017-06-27 22:45:00", "truca 
mare") testExtract("quin temps fa el divendres de matí", "2017-06-30 08:00:00", "temps fa") testExtract("truca'm per a quedar d'aquí a 8 setmanes i 2 dies", "2017-08-24 00:00:00", "truca m quedar aquí i") testExtract("Toca black-metal 2 dies després de divendres", "2017-07-02 00:00:00", "toca black-metal") testExtract("Toca satanic black metal 2 dies per a aquest divendres", "2017-07-02 00:00:00", "toca satanic black metal") testExtract("Toca super black metal 2 dies a partir d'aquest divendres", "2017-07-02 00:00:00", "toca super black metal") testExtract("Começa la invasió a les 3:45 pm de dijous", "2017-06-29 15:45:00", "começa invasió") testExtract("dilluns, compra formatge", "2017-07-03 00:00:00", "compra formatge") testExtract("Envia felicitacions d'aquí a 5 anys", "2022-06-27 00:00:00", "envia felicitacions aquí") testExtract("Envia felicitacions en 5 anys", "2022-06-27 00:00:00", "envia felicitacions") testExtract("Truca per Skype a la mare pròxim dijous a les 12:45 pm", "2017-06-29 12:45:00", "truca skype mare") testExtract("quin temps fa aquest divendres?", "2017-06-30 00:00:00", "temps fa") testExtract("quin temps fa aquest divendres per la tarda?", "2017-06-30 15:00:00", "temps fa") testExtract("quin temps farà aquest divendres de matinada?", "2017-06-30 04:00:00", "temps farà") testExtract("quin temps fa aquest divendres a mitja nit?", "2017-06-30 00:00:00", "temps fa mitjanit") testExtract("quin temps fa aquest divendres al migdia?", "2017-06-30 12:00:00", "temps fa") testExtract("quin temps fa aquest divendres al final de tarda?", "2017-06-30 19:00:00", "temps fa") testExtract("quin temps fa aquest divendres a mig matí?", "2017-06-30 10:00:00", "temps fa") testExtract("recorda de trucar a la mare el dia 3 d'agost", "2017-08-03 00:00:00", "recorda trucar mare") testExtract("compra ganivets el 13 de maig", "2018-05-13 00:00:00", "compra ganivets") testExtract("gasta diners el dia 13 de maig", "2018-05-13 00:00:00", "gasta diners") testExtract("compra 
espelmes el 13 de maig", "2018-05-13 00:00:00", "compra espelmes") testExtract("beure cervesa el 13 de maig", "2018-05-13 00:00:00", "beure cervesa") testExtract("quin temps farà 1 dia després de demà", "2017-06-29 00:00:00", "temps farà") testExtract("quin temps farà a les 0700 hores", "2017-06-27 07:00:00", "temps farà") testExtract("quin temps farà demà a les 7 en punt", "2017-06-28 07:00:00", "temps farà") testExtract("quin temps farà demà a les 2 de la tarda", "2017-06-28 14:00:00", "temps farà") testExtract("quin temps farà demà a les 2", "2017-06-28 02:00:00", "temps farà") testExtract("quin temps farà a les 2 de la tarda de divendres vinent", "2017-06-30 14:00:00", "temps farà vinent") testExtract("recorda'm de despertar en 4 anys", "2021-06-27 00:00:00", "recorda m despertar") testExtract("recorda'm de despertar en 4 anys i 4 dies", "2021-07-01 00:00:00", "recorda m despertar i") #testExtract("dorm 3 dies després de demà", # "2017-07-02 00:00:00", "dorm") testExtract("concerta cita d'aquí a 2 setmanes i 6 dies després de dissabte", "2017-07-21 00:00:00", "concerta cita aquí i") testExtract("comença la festa a les 8 en punt de la nit de dijous", "2017-06-29 20:00:00", "comença festa") def test_extractdatetime_default_ca(self): default = time(9, 0, 0) anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime( 'concerta cita per a 2 setmanes i 6 dies després de dissabte', anchor, lang='ca-es', default_time=default) self.assertEqual(default, res[0].time()) class TestExtractGender(unittest.TestCase): def test_gender_ca(self): # words with well defined grammatical gender rules self.assertEqual(get_gender("vaca", lang="ca"), "f") self.assertEqual(get_gender("cavall", lang="ca"), "m") self.assertEqual(get_gender("vaques", lang="ca"), "f") # words specifically defined in a lookup dictionary self.assertEqual(get_gender("home", lang="ca"), "m") self.assertEqual(get_gender("dona", lang="ca"), "f") self.assertEqual(get_gender("homes", lang="ca"), "m") 
self.assertEqual(get_gender("dones", lang="ca"), "f") # words where gender rules do not work but context does self.assertEqual(get_gender("bou", lang="ca"), None) self.assertEqual(get_gender("bou", "el bou menja herba", lang="ca"), "m") self.assertEqual(get_gender("home", "aquest home menja bous", lang="ca"), "m") self.assertEqual(get_gender("pont", lang="ca"), None) self.assertEqual(get_gender("pont", "aquest pont ha caigut", lang="ca"), "m") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_common.py000066400000000000000000000024471426211343400231120ustar00rootroot00000000000000# # Copyright 2019 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest from lingua_franca.lang.parse_common import tokenize, Token class TestParseCommon(unittest.TestCase): def test_tokenize(self): self.assertEqual(tokenize('One small step for man'), [Token('One', 0), Token('small', 1), Token('step', 2), Token('for', 3), Token('man', 4)]) self.assertEqual(tokenize('15%'), [Token('15', 0), Token('%', 1)]) self.assertEqual(tokenize('I am #1'), [Token('I', 0), Token('am', 1), Token('#', 2), Token('1', 3)]) self.assertEqual(tokenize('hashtag #1world'), [Token('hashtag', 0), Token('#1world', 1)]) lingua-franca-release-v0.4.3/test/test_parse_cs.py000066400000000000000000001214121426211343400222210ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest from datetime import datetime, timedelta from lingua_franca import get_default_lang, set_default_lang, \ load_language, unload_language from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_duration from lingua_franca.parse import extract_number, extract_numbers from lingua_franca.parse import fuzzy_match from lingua_franca.parse import get_gender from lingua_franca.parse import match_one from lingua_franca.parse import normalize from lingua_franca.time import default_timezone def setUpModule(): load_language("cs-cz") set_default_lang("cs") def tearDownModule(): unload_language("cs") class TestFuzzyMatch(unittest.TestCase): def test_matches(self): self.assertTrue(fuzzy_match("ty a já", "ty a já") >= 1.0) self.assertTrue(fuzzy_match("ty a já", "ty") < 0.5) self.assertTrue(fuzzy_match("Ty", "ty") >= 0.5) self.assertTrue(fuzzy_match("ty a já", "ty") == fuzzy_match("ty", "ty a já")) self.assertTrue(fuzzy_match("ty a já", "on nebo oni") < 0.23) def test_match_one(self): # test list of choices choices = ['frank', 'kate', 'harry', 'henry'] self.assertEqual(match_one('frank', choices)[0], 'frank') self.assertEqual(match_one('fran', choices)[0], 'frank') self.assertEqual(match_one('enry', choices)[0], 'henry') self.assertEqual(match_one('katt', choices)[0], 'kate') # test dictionary of choices choices = {'frank': 1, 'kate': 2, 'harry': 3, 'henry': 4} self.assertEqual(match_one('frank', 
choices)[0], 1) self.assertEqual(match_one('enry', choices)[0], 4) class TestNormalize(unittest.TestCase): def test_extract_number(self): self.assertEqual(extract_number("tohle je první test", ordinals=True), 1) self.assertEqual(extract_number("tohle je 2 test"), 2) self.assertEqual(extract_number("tohle je druhý test", ordinals=True), 2) #self.assertEqual(extract_number("tohle je třetí test"), 1.0 / 3.0) self.assertEqual(extract_number("tohle je třetí test", ordinals=True), 3.0) self.assertEqual(extract_number("ten čtvrtý", ordinals=True), 4.0) self.assertEqual(extract_number( "ten třicátý šestý", ordinals=True), 36.0) self.assertEqual(extract_number("tohle je test číslo 4"), 4) self.assertEqual(extract_number("jedna třetina šálku"), 1.0 / 3.0) self.assertEqual(extract_number("tři šálky"), 3) self.assertEqual(extract_number("1/3 šálku"), 1.0 / 3.0) self.assertEqual(extract_number("čtvrtina šálku"), 0.25) self.assertEqual(extract_number("1/4 cup"), 0.25) self.assertEqual(extract_number("jedna čtvrtina šálku"), 0.25) self.assertEqual(extract_number("2/3 šálků"), 2.0 / 3.0) self.assertEqual(extract_number("3/4 šálků"), 3.0 / 4.0) self.assertEqual(extract_number("1 a 3/4 šálků"), 1.75) self.assertEqual(extract_number("1 šálek a půl"), 1.5) self.assertEqual(extract_number("jeden šálek a polovina"), 1.5) self.assertEqual(extract_number("jedna a půl šálků"), 1.5) self.assertEqual(extract_number("jedna a jedna polovina šálků"), 1.5) self.assertEqual(extract_number("tři čtvrtina šálků"), 3.0 / 4.0) self.assertEqual(extract_number("tři čtvrtiny šálků"), 3.0 / 4.0) self.assertEqual(extract_number("dvacet dva"), 22) self.assertEqual(extract_number( "Dvacet dva s velkým písmenam na začátku"), 22) self.assertEqual(extract_number( "dvacet Dva s dva krát velkým písmem"), 22) self.assertEqual(extract_number( "dvacet Dva s různou velikostí písmen"), 22) self.assertEqual(extract_number("Dvacet dva a Tři Pětiny"), 22.6) self.assertEqual(extract_number("dvě sto"), 200) 
self.assertEqual(extract_number("devět tisíc"), 9000) self.assertEqual(extract_number("šest sto šedesát šest"), 666) self.assertEqual(extract_number("dva million"), 2000000) self.assertEqual(extract_number("dva million pět sto tisíc " "tun žhavého kovu"), 2500000) self.assertEqual(extract_number("šest trillion"), 6000000000000.0) self.assertEqual(extract_number("šest trilion", short_scale=False), 6e+18) self.assertEqual(extract_number("jedna tečka pět"), 1.5) self.assertEqual(extract_number("tři tečka čtrnáct"), 3.14) self.assertEqual(extract_number("nula tečka dva"), 0.2) self.assertEqual(extract_number("billion roků "), 1000000000.0) self.assertEqual(extract_number("bilion roků", short_scale=False), 1000000000000.0) self.assertEqual(extract_number("jedno sto tisíc"), 100000) self.assertEqual(extract_number("mínus 2"), -2) self.assertEqual(extract_number("záporné sedmdesát"), -70) self.assertEqual(extract_number("tisíc million"), 1000000000) self.assertEqual(extract_number("miliarda", short_scale=False), 1000000000) self.assertEqual(extract_number("šestina třetina"), 1 / 6 / 3) self.assertEqual(extract_number("šestina třetí", ordinals=True), 3) self.assertEqual(extract_number("třicet sekund"), 30) self.assertEqual(extract_number("třicátý druhý", ordinals=True), 32) self.assertEqual(extract_number("tohle je billiontý test", ordinals=True), 1e09) print("tohle udělat později") #self.assertEqual(extract_number("tohle je billiontý test"), 1e-9) self.assertEqual(extract_number("tohle je biliontý test", ordinals=True, short_scale=False), 1e12) print("tohle udělat později") # self.assertEqual(extract_number("tohle je biliontý test", # short_scale=False), 1e-12) # Verify non-power multiples of ten no longer discard # adjacent multipliers self.assertEqual(extract_number("dvacet tisíc"), 20000) self.assertEqual(extract_number("padesát million"), 50000000) # Verify smaller powers of ten no longer cause miscalculation of larger # powers of ten (see MycroftAI#86) 
self.assertEqual(extract_number("dvacet billion tři sto million \ devět sto padesát tisíc šest sto \ sedmdesát pět tečka osm"), 20300950675.8) self.assertEqual(extract_number("devět sto devadesát devět million devět \ sto devadesát devět tisíc devět \ sto devadesát devět tečka devět"), 999999999.9) # TODO why does "trillion" result in xxxx.0? self.assertEqual(extract_number("osm sto trillion dva sto \ padesát sedm"), 800000000000257.0) # TODO handle this case # self.assertEqual( # extract_number("6 dot six six six"), # 6.666) self.assertTrue(extract_number("Tenisový hráč je rychlý") is False) self.assertTrue(extract_number("křehký") is False) self.assertTrue(extract_number("křehká nula") is not False) self.assertEqual(extract_number("křehká nula"), 0) #self.assertTrue(extract_number("grobo 0") is not False) #self.assertEqual(extract_number("grobo 0"), 0) self.assertEqual(extract_number("dvojice piv"), 2) self.assertEqual(extract_number("dvojice sto piv"), 200) self.assertEqual(extract_number("dvojice tisíc piv"), 2000) self.assertEqual(extract_number( "tohle je 7 test", ordinals=True), 7) self.assertEqual(extract_number( "tohle je 7 test", ordinals=False), 7) self.assertTrue(extract_number("tohle je n. test") is False) self.assertEqual(extract_number("tohle je 1. test"), 1) self.assertEqual(extract_number("tohle je 2. test"), 2) self.assertEqual(extract_number("tohle je 3. test"), 3) self.assertEqual(extract_number("tohle je 31. test"), 31) self.assertEqual(extract_number("tohle je 32. test"), 32) self.assertEqual(extract_number("tohle je 33. test"), 33) self.assertEqual(extract_number("tohle je 34. 
test"), 34) self.assertEqual(extract_number("celkem 100%"), 100) def test_extract_duration_cs(self): self.assertEqual(extract_duration("10 sekund"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5 minut"), (timedelta(minutes=5), "")) self.assertEqual(extract_duration("2 hodiny"), (timedelta(hours=2), "")) self.assertEqual(extract_duration("3 dny"), (timedelta(days=3), "")) self.assertEqual(extract_duration("25 týdnů"), (timedelta(weeks=25), "")) self.assertEqual(extract_duration("sedm hodin"), (timedelta(hours=7), "")) self.assertEqual(extract_duration("7.5 sekund"), (timedelta(seconds=7.5), "")) self.assertEqual(extract_duration("osm a polovina dne třicet" " devět sekund"), (timedelta(days=8.5, seconds=39), "")) self.assertEqual(extract_duration("Nastav časovač na 30 minut"), (timedelta(minutes=30), "nastav časovač na")) self.assertEqual(extract_duration("Čtyři a půl minuty do" " západu"), (timedelta(minutes=4.5), "do západu")) self.assertEqual(extract_duration("devatenáct minut po hodině"), (timedelta(minutes=19), "po hodině")) self.assertEqual(extract_duration("vzbuď mě za tři týdny, čtyři" " sto devadesát sedm dní, a" " tři sto 91.6 sekund"), (timedelta(weeks=3, days=497, seconds=391.6), "vzbuď mě za , , a")) self.assertEqual(extract_duration("film je jedna hodina, padesát sedm" " a půl minuty dlouhý"), (timedelta(hours=1, minutes=57.5), "film je , dlouhý")) self.assertEqual(extract_duration("10-sekund"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5-minut"), (timedelta(minutes=5), "")) def test_extractdatetime_cs(self): def extractWithFormat(text): # Tue June 27, 2017 @ 1:04pm date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(normalize(text)) self.assertEqual(res[0], 
expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) testExtract("nyní je čas", "2017-06-27 13:04:00", "je čas") testExtract("za sekundu", "2017-06-27 13:04:01", "") testExtract("za minutu", "2017-06-27 13:05:00", "") # testExtract("ve dvou minutách", # "2017-06-27 13:06:00", "") # testExtract("in a couple of minutes", # "2017-06-27 13:06:00", "") # testExtract("ve dvou hodinách", # "2017-06-27 15:04:00", "") # testExtract("in a couple of hours", # "2017-06-27 15:04:00", "") # testExtract("v dvoje týden", # "2017-07-11 00:00:00", "") # testExtract("in a couple of weeks", # "2017-07-11 00:00:00", "") # testExtract("v dvoje měsíc", # "2017-08-27 00:00:00", "") # testExtract("v dvoje rok", # "2019-06-27 00:00:00", "") # testExtract("in a couple of months", # "2017-08-27 00:00:00", "") # testExtract("in a couple of years", # "2019-06-27 00:00:00", "") testExtract("v desetiletí", "2027-06-27 00:00:00", "") # testExtract("in a couple of decades", # "2037-06-27 00:00:00", "") testExtract("další desetiletí", "2027-06-27 00:00:00", "") testExtract("v století", "2117-06-27 00:00:00", "") testExtract("v tisíciletí", "3017-06-27 00:00:00", "") testExtract("v dvoje desetiletí", "2037-06-27 00:00:00", "") testExtract("v 5 desetiletí", "2067-06-27 00:00:00", "") testExtract("v dvoje století", "2217-06-27 00:00:00", "") # testExtract("in a couple of centuries", # "2217-06-27 00:00:00", "") testExtract("v 2 století", "2217-06-27 00:00:00", "") testExtract("v dvoje tisíciletí", "4017-06-27 00:00:00", "") # testExtract("in a couple of millenniums", # "4017-06-27 00:00:00", "") testExtract("v hodina", "2017-06-27 14:04:00", "") testExtract("chci to během hodiny", "2017-06-27 14:04:00", "chci to") testExtract("za 1 sekundu", "2017-06-27 13:04:01", "") testExtract("za 2 sekundy", "2017-06-27 13:04:02", "") testExtract("Nastav časovač na 1 minutu", "2017-06-27 13:05:00", "nastav časovač") testExtract("Nastav časovač na půl hodina", "2017-06-27 
13:34:00", "nastav časovač") testExtract("Nastav časovač na 5 den od dnes", "2017-07-02 00:00:00", "nastav časovač") testExtract("den po zítřku", "2017-06-29 00:00:00", "") testExtract("Jaké je počasí den po zítřku?", "2017-06-29 00:00:00", "jaké je počasí") testExtract("Připomeň mi v 10:45 pm", "2017-06-27 22:45:00", "připomeň mi") testExtract("jaké je počasí v pátek ráno", "2017-06-30 08:00:00", "jaké je počasí") testExtract("jaké je zítřejší počasí", "2017-06-28 00:00:00", "jaké je počasí") testExtract("jaké je počasí toto odpoledne", "2017-06-27 15:00:00", "jaké je počasí") testExtract("jaké je počasí tento večer", "2017-06-27 19:00:00", "jaké je počasí") testExtract("jaké bylo počasí toto ráno", "2017-06-27 08:00:00", "jaké bylo počasí") testExtract("připomeň mi abych zavolal mámě v 8 týden a 2 dny", "2017-08-24 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v srpen 3", "2017-08-03 00:00:00", "připomeň mi abych zavolal mámě") # přidat i třetího slovně testExtract("připomeň mi zítra abych zavolal mámě v 7am", "2017-06-28 07:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi zítra abych zavolal mámě v 10pm", "2017-06-28 22:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 7am", "2017-06-28 07:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v hodina", "2017-06-27 14:04:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 1730", "2017-06-27 17:30:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 0630", "2017-06-28 06:30:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 06 30 hodina", "2017-06-28 06:30:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 06 30", "2017-06-28 06:30:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 06 30 hodina", "2017-06-28 06:30:00", 
"připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 7 hodin", "2017-06-27 19:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě večer v 7 hodin", "2017-06-27 19:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 7 hodin večer", "2017-06-27 19:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 7 hodin ráno", "2017-06-28 07:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v Čtvrtek večer v 7 hodin", "2017-06-29 19:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v Čtvrtek ráno v 7 hodin", "2017-06-29 07:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 7 hodin Čtvrtek ráno", "2017-06-29 07:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 7:00 Čtvrtek ráno", "2017-06-29 07:00:00", "připomeň mi abych zavolal mámě") # TODO: This test is imperfect due to "at 7:00" still in the # remainder. 
But let it pass for now since time is correct testExtract("připomeň mi abych zavolal mámě v 7:00 Čtvrtek večer", "2017-06-29 19:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 8 Středa večer", "2017-06-28 20:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 8 Středa v večer", "2017-06-28 20:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě Středa večer v 8", "2017-06-28 20:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za dvě hodiny", "2017-06-27 15:04:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za 2 hodiny", "2017-06-27 15:04:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za 15 minut", "2017-06-27 13:19:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za patnáct minut", "2017-06-27 13:19:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za půl hodina", "2017-06-27 13:34:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za půl hodina", "2017-06-27 13:34:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za čtvrt hodina", "2017-06-27 13:19:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě za čtvrt hodina", "2017-06-27 13:19:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 10am 2 den po této sobota", "2017-07-03 10:00:00", "připomeň mi abych zavolal mámě") testExtract("Přehraj Rick Astley hudbu 2 dny od Pátek", "2017-07-02 00:00:00", "přehraj rick astley hudbu") testExtract("Začni invazi v 3:45 pm v Čtvrtek", "2017-06-29 15:45:00", "začni invazi") testExtract("V Pondělí, objednej koláč z pekárny", "2017-07-03 00:00:00", "objednej koláč z pekárny") testExtract("Přehraj Happy Birthday hudbu 5 roků od dnes", "2022-06-27 00:00:00", "přehraj happy 
birthday hudbu") testExtract("Skype Mámě v 12:45 pm další Čtvrtek", "2017-07-06 12:45:00", "skype mámě") testExtract("Jaké je počasí příští Pátek?", "2017-06-30 00:00:00", "jaké je počasí") testExtract("Jaké je počasí příští Středa?", "2017-07-05 00:00:00", "jaké je počasí") testExtract("Jaké je počasí příští Čtvrtek?", "2017-07-06 00:00:00", "jaké je počasí") testExtract("Jaké je počasí příští pátek ráno", "2017-06-30 08:00:00", "jaké je počasí") testExtract("jaké je počasí příští pátek večer", "2017-06-30 19:00:00", "jaké je počasí") testExtract("jaké je počasí příští pátek odpoledne", "2017-06-30 15:00:00", "jaké je počasí") testExtract("připomeň mi abych zavolal mámě v srpen třetího", "2017-08-03 00:00:00", "připomeň mi abych zavolal mámě") testExtract("Kup ohňostroj v 4 Červenec", "2017-07-04 00:00:00", "kup ohňostroj") testExtract("jaké je počasí 2 týdny od další pátek", "2017-07-14 00:00:00", "jaké je počasí") testExtract("jaké je počasí Středa v 0700 hodina", "2017-06-28 07:00:00", "jaké je počasí") testExtract("Nastav budík Středa v 7 hodin", "2017-06-28 07:00:00", "nastav budík") testExtract("Nastav schůzku v 12:45 pm další Čtvrtek", "2017-07-06 12:45:00", "nastav schůzku") testExtract("Jaké je počasí tento Čtvrtek?", "2017-06-29 00:00:00", "jaké je počasí") testExtract("nastav návštěvu na 2 týdny a 6 dní od Sobota", "2017-07-21 00:00:00", "nastav návštěvu") testExtract("Zahaj invazi v 03 45 v Čtvrtek", "2017-06-29 03:45:00", "zahaj invazi") testExtract("Zahaj invazi v 800 hodin v Čtvrtek", "2017-06-29 08:00:00", "zahaj invazi") testExtract("Zahaj párty v 8 hodin v večer v Čtvrtek", "2017-06-29 20:00:00", "zahaj párty") testExtract("Zahaj invazi v 8 v večer v Čtvrtek", "2017-06-29 20:00:00", "zahaj invazi") testExtract("Zahaj invazi v Čtvrtek v poledne", "2017-06-29 12:00:00", "zahaj invazi") testExtract("Zahaj invazi v Čtvrtek v půlnoc", "2017-06-29 00:00:00", "zahaj invazi") testExtract("Zahaj invazi v Čtvrtek v 0500", "2017-06-29 05:00:00", "zahaj 
invazi") testExtract("připomeň mi abych vstal v 4 roky", "2021-06-27 00:00:00", "připomeň mi abych vstal") testExtract("připomeň mi abych vstal v 4 roky a 4 dny", "2021-07-01 00:00:00", "připomeň mi abych vstal") testExtract("jaké je počasí 3 dny po zítra?", "2017-07-01 00:00:00", "jaké je počasí") testExtract("prosinec 3", "2017-12-03 00:00:00", "") testExtract("sejdeme se v 8:00 dnes večer", "2017-06-27 20:00:00", "sejdeme se") testExtract("sejdeme se v 5pm", "2017-06-27 17:00:00", "sejdeme se") testExtract("sejdeme se v 8 am", "2017-06-28 08:00:00", "sejdeme se") testExtract("připomeň mi abych vstal v 8 am", "2017-06-28 08:00:00", "připomeň mi abych vstal") testExtract("jaké je počasí v úterý", "2017-06-27 00:00:00", "jaké je počasí") testExtract("jaké je počasí v pondělí", "2017-07-03 00:00:00", "jaké je počasí") testExtract("jaké je počasí toto Středa", "2017-06-28 00:00:00", "jaké je počasí") testExtract("v Čtvrtek jaké je počasí", "2017-06-29 00:00:00", "jaké je počasí") testExtract("tento Čtvrtek jaké je počasí", "2017-06-29 00:00:00", "jaké je počasí") testExtract("poslední pondělí jaké bylo počasí", "2017-06-26 00:00:00", "jaké bylo počasí") testExtract("nastav budík na Středa večer v 8", "2017-06-28 20:00:00", "nastav budík") testExtract("nastav budík na Středa v 3 hodiny v odpoledne", "2017-06-28 15:00:00", "nastav budík") testExtract("nastav budík na Středa v 3 hodiny v ráno", "2017-06-28 03:00:00", "nastav budík") testExtract("nastav budík na Středa ráno v 7 hodin", "2017-06-28 07:00:00", "nastav budík") testExtract("nastav budík na dnes v 7 hodin", "2017-06-27 19:00:00", "nastav budík") testExtract("nastav budík na tento večer v 7 hodin", "2017-06-27 19:00:00", "nastav budík") # TODO: This test is imperfect due to the "at 7:00" still in the # remainder. 
But let it pass for now since time is correct testExtract("nastav budík na tento večer v 7:00", "2017-06-27 19:00:00", "nastav budík v 7:00") testExtract("večer v červen 5 2017 připomeň mi" + " abych zavolal mámě", "2017-06-05 19:00:00", "připomeň mi abych zavolal mámě") # TODO: This test is imperfect due to the missing "for" in the # remainder. But let it pass for now since time is correct testExtract("aktualizuj můj kalendář na ranní schůzku s julius" + " v březnu 4", "2018-03-04 08:00:00", "aktualizuj můj kalendář schůzku s julius") testExtract("připomeň mi abych zavolal mámě další úterý", "2017-07-04 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě 3 týdny", "2017-07-18 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 8 týdny", "2017-08-22 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 8 týdny a 2 dny", "2017-08-24 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 4 dny", "2017-07-01 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 3 měsíce", "2017-09-27 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 2 roky a 2 dny", "2019-06-29 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě další týden", "2017-07-04 00:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 10am v Sobota", "2017-07-01 10:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 10am tato Sobota", "2017-07-01 10:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 10 další Sobota", "2017-07-01 10:00:00", "připomeň mi abych zavolal mámě") testExtract("připomeň mi abych zavolal mámě v 10am další Sobota", "2017-07-01 10:00:00", "připomeň mi abych zavolal mámě") # test yesterday testExtract("jaký den byl včera", 
"2017-06-26 00:00:00", "jaký den byl") testExtract("jaký den byl den před včera", "2017-06-25 00:00:00", "jaký den byl") testExtract("měl jsem večeři včera v 6", "2017-06-26 06:00:00", "měl jsem večeři") testExtract("měl jsem večeři včera v 6 am", "2017-06-26 06:00:00", "měl jsem večeři") testExtract("měl jsem večeři včera v 6 pm", "2017-06-26 18:00:00", "měl jsem večeři") # Below two tests, ensure that time is picked # even if no am/pm is specified # in case of weekdays/tonight testExtract("nastav budík na 9 o víkendech", "2017-06-27 21:00:00", "nastav budík víkendech") testExtract("na 8 dnes večer", "2017-06-27 20:00:00", "") testExtract("na 8:30pm dnes večer", "2017-06-27 20:30:00", "") # Tests a time with ':' & without am/pm testExtract("nastav budík na dnes večer 9:30", "2017-06-27 21:30:00", "nastav budík") testExtract("nastav budík na 9:00 na dnes večer", "2017-06-27 21:00:00", "nastav budík") # Check if it picks intent irrespective of correctness testExtract("nastav budík na 9 hodin dnes večer", "2017-06-27 21:00:00", "nastav budík") testExtract("připomeň mi hru dnes v noci v 11:30", "2017-06-27 23:30:00", "připomeň mi hru") testExtract("nastav budík v 7:30 o výkendech", "2017-06-27 19:30:00", "nastav budík o výkendech") # "# days " testExtract("mé narozeniny jsou 2 dny od dnes", "2017-06-29 00:00:00", "mé narozeniny jsou") testExtract("mé narozeniny jsou 2 dny po dnes", "2017-06-29 00:00:00", "mé narozeniny jsou") testExtract("mé narozeniny jsou 2 dny od zítra", "2017-06-30 00:00:00", "mé narozeniny jsou") testExtract("mé narozeniny jsou 2 dny od zítra", "2017-06-30 00:00:00", "mé narozeniny jsou") testExtract("připomeň mi abych zavolal mámě v 10am 2 dny po další Sobota", "2017-07-10 10:00:00", "připomeň mi abych zavolal mámě") testExtract("mé narozeniny jsou 2 dny od včera", "2017-06-28 00:00:00", "mé narozeniny jsou") testExtract("mé narozeniny jsou 2 dny po včera", "2017-06-28 00:00:00", "mé narozeniny jsou") # "# days ago>" testExtract("mé narozeniny 
byly před 1 den", "2017-06-26 00:00:00", "mé narozeniny byly") testExtract("mé narozeniny byly před 2 dny", "2017-06-25 00:00:00", "mé narozeniny byly") testExtract("mé narozeniny byly před 3 dny", "2017-06-24 00:00:00", "mé narozeniny byly") testExtract("mé narozeniny byly před 4 dny", "2017-06-23 00:00:00", "mé narozeniny byly") # TODO this test is imperfect due to "tonight" in the reminder, but let is pass since the date is correct testExtract("sejdeme se dnes v noci", "2017-06-27 22:00:00", "sejdeme se noci") # TODO this test is imperfect due to "at night" in the reminder, but let is pass since the date is correct testExtract("sejdeme se později v noci", "2017-06-27 22:00:00", "sejdeme se později v noci") # TODO this test is imperfect due to "night" in the reminder, but let is pass since the date is correct testExtract("Jaké bude počasí zítra v noci", "2017-06-28 22:00:00", "jaké bude počasí v noci") # TODO this test is imperfect due to "night" in the reminder, but let is pass since the date is correct testExtract("jaké bude počasí příští úterý v noci", "2017-07-04 22:00:00", "jaké bude počasí v noci") def test_extract_ambiguous_time_cs(self): morning = datetime(2017, 6, 27, 8, 1, 2, tzinfo=default_timezone()) večer = datetime(2017, 6, 27, 20, 1, 2, tzinfo=default_timezone()) noonish = datetime(2017, 6, 27, 12, 1, 2, tzinfo=default_timezone()) self.assertEqual( extract_datetime('krmení ryb'), None) self.assertEqual( extract_datetime('den'), None) self.assertEqual( extract_datetime('týden'), None) self.assertEqual( extract_datetime('měsíc'), None) self.assertEqual( extract_datetime('rok'), None) self.assertEqual( extract_datetime(' '), None) self.assertEqual( extract_datetime('nakrmit ryby v 10 hodin', morning)[0], datetime(2017, 6, 27, 10, 0, 0, tzinfo=default_timezone())) self.assertEqual( extract_datetime('nakrmit ryby v 10 hodin', noonish)[0], datetime(2017, 6, 27, 22, 0, 0, tzinfo=default_timezone())) self.assertEqual( extract_datetime('nakrmit ryby v 10 
hodin', večer)[0], datetime(2017, 6, 27, 22, 0, 0, tzinfo=default_timezone())) """ In Czech is May and may have different format def test_extract_date_with_may_I_cs(self): now = datetime(2019, 7, 4, 8, 1, 2) may_date = datetime(2019, 5, 2, 10, 11, 20) self.assertEqual( extract_datetime('Můžu vědět jaký je to čas zítra', now)[0], datetime(2019, 7, 5, 0, 0, 0)) self.assertEqual( extract_datetime('Můžu vědět kdy je 10 hodin', now)[0], datetime(2019, 7, 4, 10, 0, 0)) self.assertEqual( extract_datetime('24. můžu chtít připomenutí', may_date)[0], datetime(2019, 5, 24, 0, 0, 0)) """ def test_extract_relativedatetime_cs(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 10, 1, 2, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(normalize(text)) self.assertEqual(res[0], expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) testExtract("sejdeme se za 5 minut", "2017-06-27 10:06:02", "sejdeme se") testExtract("sejdeme se za 5minut", "2017-06-27 10:06:02", "sejdeme se") testExtract("sejdeme se za 5 sekund", "2017-06-27 10:01:07", "sejdeme se") testExtract("sejdeme se za 1 hodinu", "2017-06-27 11:01:02", "sejdeme se") testExtract("sejdeme se za 2 hodiny", "2017-06-27 12:01:02", "sejdeme se") print("TODO") # Need better normaliting procedure for czech inflexion # testExtract("sejdeme se za 2hodiny", # "2017-06-27 12:01:02", "sejdeme se") testExtract("sejdeme se za 1 minutu", "2017-06-27 10:02:02", "sejdeme se") testExtract("sejdeme se za 1 sekundu", "2017-06-27 10:01:03", "sejdeme se") testExtract("sejdeme se za 5sekund", "2017-06-27 10:01:07", "sejdeme se") def test_spaces(self): self.assertEqual(normalize(" tohle je test"), "tohle je test") self.assertEqual(normalize(" tohle je test "), "tohle je test") 
self.assertEqual(normalize(" tohle je jedna test"), "tohle je 1 test") def test_numbers(self): self.assertEqual(normalize("tohle je jedna dva tři test"), "tohle je 1 2 3 test") self.assertEqual(normalize(" to je čtyři pět šest test"), "to je 4 5 6 test") self.assertEqual(normalize("to je sedum osum devět test"), "to je 7 8 9 test") self.assertEqual(normalize("to je sedm osm devět test"), "to je 7 8 9 test") self.assertEqual(normalize("tohle je deset jedenáct dvanáct test"), "tohle je 10 11 12 test") self.assertEqual(normalize("tohle je třináct čtrnáct test"), "tohle je 13 14 test") self.assertEqual(normalize("tohle je patnáct šestnáct sedmnáct"), "tohle je 15 16 17") self.assertEqual(normalize("tohle je osmnáct devatenáct dvacet"), "tohle je 18 19 20") self.assertEqual(normalize("tohle je jedna devatenáct dvacet dva"), "tohle je 1 19 20 2") self.assertEqual(normalize("tohle je jedna sto"), "tohle je 1 sto") self.assertEqual(normalize("tohle je jedna dva dvacet dva"), "tohle je 1 2 20 2") self.assertEqual(normalize("tohle je jedna a půl"), "tohle je 1 a půl") self.assertEqual(normalize("tohle je jedna a půl a pět šest"), "tohle je 1 a půl a 5 6") def test_multiple_numbers(self): self.assertEqual(extract_numbers("tohle je jedna dva tři test"), [1.0, 2.0, 3.0]) self.assertEqual(extract_numbers("to je čtyři pět šest test"), [4.0, 5.0, 6.0]) self.assertEqual(extract_numbers("tohle je deset jedenáct dvanáct test"), [10.0, 11.0, 12.0]) self.assertEqual(extract_numbers("tohle je jedna dvacet jedna test"), [1.0, 21.0]) self.assertEqual(extract_numbers("1 pes, sedm prasat, macdonald měl " "farmu, 3 krát 5 makaréna"), [1, 7, 3, 5]) self.assertEqual(extract_numbers("dva piva pro dva medvědy"), [2.0, 2.0]) self.assertEqual(extract_numbers("dvacet 20 dvacet"), [20, 20, 20]) self.assertEqual(extract_numbers("dvacet 20 22"), [20.0, 20.0, 22.0]) self.assertEqual(extract_numbers("dvacet dvacet dva dvacet"), [20, 22, 20]) self.assertEqual(extract_numbers("dvacet 2"), [22.0]) 
self.assertEqual(extract_numbers("dvacet 20 dvacet 2"), [20, 20, 22]) self.assertEqual(extract_numbers("třetina jedna"), [1 / 3, 1]) self.assertEqual(extract_numbers("třetí", ordinals=True), [3]) self.assertEqual(extract_numbers("šest trillion", short_scale=True), [6e12]) self.assertEqual(extract_numbers("šest trilion", short_scale=False), [6e18]) self.assertEqual(extract_numbers("dvě prasátka a šest trillion bakterií", short_scale=True), [2, 6e12]) self.assertEqual(extract_numbers("dvě prasátka a šest trilion bakterií", short_scale=False), [2, 6e18]) self.assertEqual(extract_numbers("třicátý druhý nebo první", ordinals=True), [32, 1]) self.assertEqual(extract_numbers("tohle je sedm osm devět a" " půl test"), [7.0, 8.0, 9.5]) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_da.py000066400000000000000000000240461426211343400222050ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# import unittest from datetime import datetime, time from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_number from lingua_franca.parse import normalize from lingua_franca.time import default_timezone def setUpModule(): load_language('da-dk') set_default_lang('da') def tearDownModule(): unload_language('da') class TestNormalize(unittest.TestCase): def test_articles(self): self.assertEqual( normalize("dette er en test", lang="da-dk", remove_articles=True), "dette er 1 test") self.assertEqual( normalize("og endnu en test", lang="da-dk", remove_articles=True), "og endnu 1 test") self.assertEqual(normalize("dette er en extra-test", lang="da-dk", remove_articles=False), "dette er 1 extra-test") def test_extract_number(self): self.assertEqual(extract_number("dette er den første test", lang="da-dk"), 1) # self.assertEqual(extract_number("dette er den 1. test", # lang="da-dk"), # 1) self.assertEqual(extract_number("dette er den anden test", lang="da-dk"), 2) # self.assertEqual(extract_number("dette er den 2. 
test", # lang="da-dk"), # 2) self.assertEqual( extract_number("dette er den tredie test", lang="da-dk"), 3) self.assertEqual( extract_number("dette er test nummer fire", lang="da-dk"), 4) self.assertEqual( extract_number("dette er test nummer Fire", lang="da-dk"), 4) self.assertEqual( extract_number("en trediedel af en kop", lang="da-dk"), 1.0 / 3.0) self.assertEqual(extract_number("tre kopper", lang="da-dk"), 3) self.assertEqual(extract_number("1/3 kop", lang="da-dk"), 1.0 / 3.0) # self.assertEqual(extract_number("en fjerdelel kop", lang="da-dk"), # 0.25) # self.assertEqual(extract_number("1/4 kop", lang="da-dk"), 0.25) # self.assertEqual(extract_number("kvart kop", lang="da-dk"), 0.25) # self.assertEqual(extract_number("2/3 kop", lang="da-dk"), 2.0 / 3.0) # self.assertEqual(extract_number("3/4 kop", lang="da-dk"), 3.0 / 4.0) # self.assertEqual(extract_number("1 og 3/4 kop", lang="da-dk"), 1.75) # self.assertEqual(extract_number("1 og en halv kop", lang="da-dk"), # 1.5) # self.assertEqual( # extract_number("en og en halv kop", lang="da-dk"), 1.5) # self.assertEqual(extract_number("tre fjerdele kop", lang="da-dk"), # 3.0 / 4.0) # self.assertEqual(extract_number("tre fjerdedel kop", lang="da-dk"), # 3.0 / 4.0) def test_extractdatetime_da(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 0, 0, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date, lang="da-dk", ) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(text) self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) testExtract("sæt frisøraftale på fredag", "2017-06-30 00:00:00", "sæt frisøraftale") testExtract("hvordan er vejret i overmorgen?", "2017-06-29 00:00:00", "hvordan er vejret") testExtract("mind mig om det 10:45 i aften", "2017-06-27 22:45:00", "mind mig") testExtract("hvordan er vejret fredag om morgenen", 
"2017-06-30 08:00:00", "hvordan er vejret") # testExtract("hvordan er vejret i morgen", # "2017-06-28 00:00:00", "hvordan er vejret") testExtract( "påmind mig at ringe min mor om 8 uger og 2 dage", "2017-08-24 00:00:00", "påmind mig at ringe min mor") testExtract("afspil rick astley musik 2 dage fra fredag", "2017-07-02 00:00:00", "afspil rick astley musik") testExtract("start inversionen 3:45 pm på torsdag", "2017-06-29 15:45:00", "start inversionen") testExtract("på mandag bestil kager fra bageren", "2017-07-03 00:00:00", "bestil kager fra bageren") testExtract("spil happy birthday musik om 5 år fra nu", "2022-06-27 00:00:00", "spil happy birthday musik") testExtract("skype mor klokken 12:45 pm næste torsdag", "2017-07-06 12:45:00", "skype mor") testExtract("hvordan er vejret på næste torsdag", "2017-07-06 00:00:00", "hvordan er vejret") testExtract("hvordan er vejret næste fredag morgen", "2017-07-07 08:00:00", "hvordan er vejret") testExtract("hvordan er vejret næste fredag aften", "2017-07-07 19:00:00", "hvordan er vejret") testExtract("hvordan er vejret næste fredag eftermiddag", "2017-07-07 15:00:00", "hvordan er vejret") testExtract("påmind mig at ringe min mor den tredie august", "2017-08-03 00:00:00", "påmind mig at ringe min mor") testExtract("køb fyrværkeri den enogtyvende juli", "2017-07-21 00:00:00", "køb fyrværkeri") testExtract("hvordan er vejret 2 uger fra næste fredag", "2017-07-21 00:00:00", "hvordan er vejret") testExtract("hvordan er vejret på onsdag klokken 07:00", "2017-06-28 07:00:00", "hvordan er vejret") testExtract("hvordan er vejret på onsdag klokken 7", "2017-06-28 07:00:00", "hvordan er vejret") testExtract("marker en termin klokken 12:45 på næste torsdag", "2017-07-06 12:45:00", "marker en termin") testExtract("hvordan er vejret på torsdag", "2017-06-29 00:00:00", "hvordan er vejret") testExtract("forbered et besøg på 2 uger og 6 dage fra på lørdag", "2017-07-21 00:00:00", "forbered et besøg") testExtract("begynd invasionen klokken 
03:45 på torsdag", "2017-06-29 03:45:00", "begynd invasionen") testExtract("begynd invasionen klokken 3 om natten på torsdag", "2017-06-29 03:00:00", "begynd invasionen") testExtract("begynd invasionen klokken 8 am på torsdag", "2017-06-29 08:00:00", "begynd invasionen") testExtract("start festen klokken 8 om aftenen på torsdag", "2017-06-29 20:00:00", "start festen") testExtract("start invasionen klokken 8 om aftenen på torsdag", "2017-06-29 20:00:00", "start invasionen") testExtract("start invasionen på torsdag ved middag", "2017-06-29 12:00:00", "start invasionen") # testExtract("start invasionen på torsdag om eftermiddagen", # "2017-06-29 00:00:00", "start invasionen") testExtract("start invasionen på torsdag klokken 5", "2017-06-29 05:00:00", "start invasionen") testExtract("husk at vågne op om 4 år", "2021-06-27 00:00:00", "husk at vågne op") testExtract("husk at vågne op om 4 år og 4 dage", "2021-07-01 00:00:00", "husk at vågne op") # testExtract("hvordan er vejret om 3 dage fra i morgen", # "2017-07-01 00:00:00", "hvordan er vejret") # testExtract("tredie december", # "2017-12-03 00:00:00", "") # testExtract("lad os mødes klokken 8:00 om aftenen", # "2017-06-27 20:00:00", "lad os mødes") def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" with self.assertWarns(UserWarning): self.assertEqual(extract_datetime('ingen tid', lang='da-da'), None) def test_extractdatetime_default_da(self): default = time(9, 0, 0) anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime("lad os mødes på fredag klokken 9 om morgenen", anchor, lang='da-dk', default_time=default) self.assertEqual(default, res[0].time()) def test_spaces(self): self.assertEqual(normalize(" dette er en test", lang="da-dk"), "dette er 1 test") self.assertEqual(normalize(" dette er en test ", lang="da-dk"), "dette er 1 test") def test_numbers(self): self.assertEqual( normalize("dette er en to tre test", lang="da-dk"), "dette er 1 2 3 test") 
self.assertEqual( normalize("dette er fire fem seks test", lang="da-dk"), "dette er 4 5 6 test") self.assertEqual( normalize("dette er syv otte ni test", lang="da-dk"), "dette er 7 8 9 test") self.assertEqual( normalize("dette er ti elve tolv test", lang="da-dk"), "dette er 10 11 12 test") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_de.py000066400000000000000000000324521426211343400222110ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest from datetime import datetime, time, timedelta from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_duration from lingua_franca.parse import extract_number from lingua_franca.parse import normalize def setUpModule(): load_language("de-de") set_default_lang("de") def tearDownModule(): unload_language("de") class TestNormalize(unittest.TestCase): def test_articles(self): self.assertEqual( normalize("dies ist der test", lang="de-de", remove_articles=True), "dies ist test") self.assertEqual( normalize("und noch ein Test", lang="de-de", remove_articles=True), "und noch 1 Test") self.assertEqual(normalize("dies ist der Extra-Test", lang="de-de", remove_articles=False), "dies ist der Extra-Test") def test_extract_number(self): self.assertEqual(extract_number("dies ist der 1. 
Test", lang="de-de"), 1) self.assertEqual(extract_number("dies ist der erste Test", lang="de-de"), 1) self.assertEqual(extract_number("dies ist 2 Test", lang="de-de"), 2) self.assertEqual(extract_number("dies ist zweiter Test", lang="de-de"), 2) self.assertEqual( extract_number("dies ist der dritte Test", lang="de-de"), 3) self.assertEqual( extract_number("dies ist der Test Nummer 4", lang="de-de"), 4) self.assertEqual(extract_number("ein drittel einer Tasse", lang="de-de"), 1.0 / 3.0) self.assertEqual(extract_number("drei Tassen", lang="de-de"), 3) self.assertEqual(extract_number("1/3 Tasse", lang="de-de"), 1.0 / 3.0) self.assertEqual(extract_number("eine viertel Tasse", lang="de-de"), 0.25) self.assertEqual(extract_number("1/4 Tasse", lang="de-de"), 0.25) self.assertEqual(extract_number("viertel Tasse", lang="de-de"), 0.25) self.assertEqual(extract_number("2/3 Tasse", lang="de-de"), 2.0 / 3.0) self.assertEqual(extract_number("3/4 Tasse", lang="de-de"), 3.0 / 4.0) self.assertEqual(extract_number("1 und 3/4 Tassen", lang="de-de"), 1.75) self.assertEqual(extract_number("1 Tasse und eine halbe", lang="de-de"), 1.5) self.assertEqual( extract_number("eine Tasse und eine halbe", lang="de-de"), 1.5) self.assertEqual( extract_number("eine und eine halbe Tasse", lang="de-de"), 1.5) self.assertEqual(extract_number("ein und ein halb Tassen", lang="de-de"), 1.5) self.assertEqual(extract_number("drei Viertel Tasse", lang="de-de"), 3.0 / 4.0) self.assertEqual(extract_number("drei Viertel Tassen", lang="de-de"), 3.0 / 4.0) self.assertEqual(extract_number("Drei Viertel Tassen", lang="de-de"), 3.0 / 4.0) def test_extractdatetime_de(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 0, 0) [extractedDate, leftover] = extract_datetime(text, date, lang="de-de", ) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(text) self.assertEqual(res[0], 
expected_date) self.assertEqual(res[1], expected_leftover) testExtract("setze den frisörtermin auf 5 tage von heute", "2017-07-02 00:00:00", "setze frisörtermin") testExtract("wie ist das wetter übermorgen?", "2017-06-29 00:00:00", "wie ist das wetter") testExtract("erinnere mich um 10:45 abends", "2017-06-27 22:45:00", "erinnere mich") testExtract("was ist das Wetter am freitag morgen", "2017-06-30 08:00:00", "was ist das wetter") testExtract("wie ist das wetter morgen", "2017-06-28 00:00:00", "wie ist das wetter") testExtract( "erinnere mich meine mutter anzurufen in 8 Wochen und 2 Tagen", "2017-08-24 00:00:00", "erinnere mich meine mutter anzurufen") testExtract("spiele rick astley musik 2 tage von freitag", "2017-07-02 00:00:00", "spiele rick astley musik") testExtract("starte die invasion um 3:45 pm am Donnerstag", "2017-06-29 15:45:00", "starte die invasion") testExtract("am montag bestelle kuchen von der bäckerei", "2017-07-03 00:00:00", "bestelle kuchen von bäckerei") testExtract("spiele happy birthday musik 5 jahre von heute", "2022-06-27 00:00:00", "spiele happy birthday musik") testExtract("skype mama um 12:45 pm nächsten Donnerstag", "2017-07-06 12:45:00", "skype mama") testExtract("wie ist das wetter nächsten donnerstag?", "2017-07-06 00:00:00", "wie ist das wetter") testExtract("wie ist das Wetter nächsten Freitag morgen", "2017-07-07 08:00:00", "wie ist das wetter") testExtract("wie ist das wetter nächsten freitag abend", "2017-07-07 19:00:00", "wie ist das wetter") testExtract("wie ist das wetter nächsten freitag nachmittag", "2017-07-07 15:00:00", "wie ist das wetter") testExtract("erinnere mich mama anzurufen am dritten august", "2017-08-03 00:00:00", "erinnere mich mama anzurufen") testExtract("kaufe feuerwerk am einundzwanzigsten juli", "2017-07-21 00:00:00", "kaufe feuerwerk") testExtract("wie ist das wetter 2 wochen ab nächsten freitag", "2017-07-21 00:00:00", "wie ist das wetter") testExtract("wie ist das wetter am mittwoch um 07:00", 
"2017-06-28 07:00:00", "wie ist das wetter") testExtract("wie ist das wetter am mittwoch um 7 uhr", "2017-06-28 07:00:00", "wie ist das wetter") testExtract("Mache einen Termin um 12:45 pm nächsten donnerstag", "2017-07-06 12:45:00", "mache einen termin") testExtract("wie ist das wetter an diesem donnerstag?", "2017-06-29 00:00:00", "wie ist das wetter") testExtract("vereinbare den besuch für 2 wochen und 6 tage ab samstag", "2017-07-21 00:00:00", "vereinbare besuch") testExtract("beginne die invasion um 03:45 am donnerstag", "2017-06-29 03:45:00", "beginne die invasion") testExtract("beginne die invasion um 3 uhr nachts am donnerstag", "2017-06-29 03:00:00", "beginne die invasion") testExtract("beginne die invasion um 8 Uhr am donnerstag", "2017-06-29 08:00:00", "beginne die invasion") testExtract("starte die party um 8 uhr abends am donnerstag", "2017-06-29 20:00:00", "starte die party") testExtract("starte die invasion um 8 abends am donnerstag", "2017-06-29 20:00:00", "starte die invasion") testExtract("starte die invasion am donnerstag um mittag", "2017-06-29 12:00:00", "starte die invasion") testExtract("starte die invasion am donnerstag um mitternacht", "2017-06-29 00:00:00", "starte die invasion") testExtract("starte die invasion am donnerstag um 5 uhr", "2017-06-29 05:00:00", "starte die invasion") testExtract("erinnere mich aufzuwachen in 4 jahren", "2021-06-27 00:00:00", "erinnere mich aufzuwachen") testExtract("erinnere mich aufzuwachen in 4 jahren und 4 tagen", "2021-07-01 00:00:00", "erinnere mich aufzuwachen") testExtract("wie ist das wetter 3 Tage nach morgen?", "2017-07-01 00:00:00", "wie ist das wetter") testExtract("dritter dezember", "2017-12-03 00:00:00", "") testExtract("lass uns treffen um 8:00 abends", "2017-06-27 20:00:00", "lass uns treffen") def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" self.assertEqual(extract_datetime('kein zeit', lang='de-de'), None) def 
test_extractdatetime_default_de(self): default = time(9, 0, 0) anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime("lass uns treffen am freitag", anchor, lang='de-de', default_time=default) self.assertEqual(default, res[0].time()) def test_extract_duration_de(self): self.assertEqual(extract_duration("10 sekunden", lang="de-de"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5 minuten", lang="de-de"), (timedelta(minutes=5), "")) self.assertEqual(extract_duration("2 stunden", lang="de-de"), (timedelta(hours=2), "")) self.assertEqual(extract_duration("3 tage", lang="de-de"), (timedelta(days=3), "")) self.assertEqual(extract_duration("25 wochen", lang="de-de"), (timedelta(weeks=25), "")) # TODO no german text to number parsing yet #self.assertEqual(extract_duration("sieben stunden"), # (timedelta(hours=7), "")) self.assertEqual(extract_duration("7.5 sekunden", lang="de-de"), (timedelta(seconds=7.5), "")) #self.assertEqual(extract_duration("eight and a half days thirty" # " nine seconds"), # (timedelta(days=8.5, seconds=39), "")) self.assertEqual(extract_duration("starte timer für 30 minuten", lang="de-de"), (timedelta(minutes=30), "starte timer für")) #self.assertEqual(extract_duration("Four and a half minutes until" # " sunset"), # (timedelta(minutes=4.5), "until sunset")) #self.assertEqual(extract_duration("Nineteen minutes past the hour"), # (timedelta(minutes=19), "past the hour")) self.assertEqual(extract_duration("weck mich in 3 wochen, " " 497 tage und" " 391.6 sekunden", lang="de-de"), (timedelta(weeks=3, days=497, seconds=391.6), "weck mich in , und")) #self.assertEqual(extract_duration("The movie is one hour, fifty seven" # " and a half minutes long"), # (timedelta(hours=1, minutes=57.5), # "the movie is , long")) self.assertEqual(extract_duration("10-sekunden", lang="de-de"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5-minuten", lang="de-de"), (timedelta(minutes=5), "")) def test_spaces(self): 
self.assertEqual(normalize(" dies ist ein test", lang="de-de"), "dies ist 1 test") self.assertEqual(normalize(" dies ist ein test ", lang="de-de"), "dies ist 1 test") def test_numbers(self): self.assertEqual( normalize("dies ist eins zwei drei test", lang="de-de"), "dies ist 1 2 3 test") self.assertEqual( normalize("es ist vier fünf sechs test", lang="de-de"), "es ist 4 5 6 test") self.assertEqual( normalize("es ist sieben acht neun test", lang="de-de"), "es ist 7 8 9 test") self.assertEqual( normalize("es ist sieben acht neun test", lang="de-de"), "es ist 7 8 9 test") self.assertEqual( normalize("dies ist zehn elf zwölf test", lang="de-de"), "dies ist 10 11 12 test") self.assertEqual( normalize("dies ist dreizehn vierzehn test", lang="de-de"), "dies ist 13 14 test") self.assertEqual( normalize("dies ist fünfzehn sechzehn siebzehn", lang="de-de"), "dies ist 15 16 17") self.assertEqual( normalize("dies ist achtzehn neunzehn zwanzig", lang="de-de"), "dies ist 18 19 20") if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_es.py000066400000000000000000000271041426211343400222260ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
#
from datetime import datetime
import unittest

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.parse import (normalize, extract_numbers, extract_number,
                                 extract_datetime)
from lingua_franca.lang.parse_es import extract_datetime_es, is_fractional_es
from lingua_franca.time import default_timezone


def setUpModule():
    """Load Spanish and make it the default language for this module."""
    load_language('es-es')
    set_default_lang('es')


def tearDownModule():
    """Unload Spanish once the tests of this module are done."""
    unload_language('es')


class TestNormalize(unittest.TestCase):
    """Spanish normalization, number-extraction and fraction test cases."""

    def test_articles_es(self):
        """Normalize phrases with article removal switched on."""
        expectations = (
            ("esta es la prueba", "esta es prueba"),
            ("y otra prueba", "y otra prueba"),
        )
        for phrase, normalized in expectations:
            self.assertEqual(
                normalize(phrase, lang="es", remove_articles=True),
                normalized)

    def test_numbers_es(self):
        """Normalize spelled-out Spanish numbers into digits."""
        expectations = (
            ("esto es un uno una", "esto es 1 1 1"),
            ("esto es dos tres prueba", "esto es 2 3 prueba"),
            ("esto es cuatro cinco seis prueba", "esto es 4 5 6 prueba"),
            ("siete más ocho más nueve", "7 más 8 más 9"),
            ("diez once doce trece catorce quince", "10 11 12 13 14 15"),
            ("dieciséis diecisiete", "16 17"),
            ("dieciocho diecinueve", "18 19"),
            ("veinte treinta cuarenta", "20 30 40"),
            ("treinta y dos caballos", "32 caballos"),
            ("cien caballos", "100 caballos"),
            ("ciento once caballos", "111 caballos"),
            ("había cuatrocientas una vacas", "había 401 vacas"),
            ("dos mil", "2000"),
            ("dos mil trescientas cuarenta y cinco", "2345"),
            ("ciento veintitrés mil cuatrocientas cincuenta y seis",
             "123456"),
            ("quinientas veinticinco mil", "525000"),
            ("novecientos noventa y nueve mil novecientos noventa y nueve",
             "999999"),
        )
        for phrase, normalized in expectations:
            self.assertEqual(normalize(phrase, lang="es"), normalized)

    def test_extract_number_es(self):
        """Extract digits, number words, decimals and fractions."""
        # The same six numbers are expected with or without unrelated words
        # mixed into the phrase.
        mixed_phrases = (
            "1 7 cuatro catorce ocho 157",
            "1 7 cuatro albuquerque naranja John Doe catorce ocho 157",
        )
        for phrase in mixed_phrases:
            found = sorted(extract_numbers(phrase, lang='es'))
            self.assertEqual(found, [1, 4, 7, 8, 14, 157])

        # "punto" and "coma" are both expected to act as decimal separators,
        # regardless of capitalisation.
        for phrase in ("seis punto dos", "seis punto Dos", "seis coma dos"):
            self.assertEqual(extract_number(phrase, lang='es'), 6.2)

        self.assertEqual(extract_numbers("un medio", lang='es'), [0.5])

        single_values = (
            ("cuarto", 0.25),
            ("2.0", 2.0),
            ("1/4", 0.25),
            ("dos y media", 2.5),
            ("catorce y milésima", 14.001),
            ("dos punto cero dos", 2.02),
        )
        for phrase, value in single_values:
            self.assertEqual(extract_number(phrase, lang='es'), value)

    def test_isFraction_es(self):
        """Map fraction words onto the fraction they denote."""
        denominators = (
            ("vigésimo", 20),
            ("vigésima", 20),
            ("trigésimo", 30),
            ("centésima", 100),
            ("centésimo", 100),
            ("milésima", 1000),
        )
        for word, denominator in denominators:
            self.assertEqual(is_fractional_es(word), 1.0 / denominator)

    @unittest.skip("unwritten logic")
    def test_comma_fraction_logic_es(self):
        """Parse a comma-separated decimal (skipped: not implemented)."""
        # Logic has not been written to parse "#,#" as "#.#"
        # English-style decimal numbers work because they just get
        # float(str)ed
        self.assertEqual(extract_number("2,0", lang='es'), 2.0)


class TestDatetime_es(unittest.TestCase):
    """Spanish date and time extraction test cases."""

    def test_datetime_by_date_es(self):
        """Parse "<day> <abbreviated month>" phrases into dates."""
        # Direct call into the Spanish parser, anchored on the current
        # moment: "11 ene" is expected in the current year only while
        # January 11th has not been reached yet, otherwise in the next year.
        today = datetime.now()
        if today.month == 1 and today.day < 11:
            expected_year = today.year
        else:
            expected_year = today.year + 1
        self.assertEqual(extract_datetime_es("11 ene", anchorDate=today)[0],
                         datetime(expected_year, 1, 11))

        # Month abbreviations, each anchored on the first day of that month
        # in 1998. June through September are left out here because there
        # is an issue with those months (see the skipped test below).
        month_phrases = (
            ("11 ene", 1),
            ("11 feb", 2),
            ("11 mar", 3),
            ("11 abr", 4),
            ("11 may", 5),
            ("11 oct", 10),
            ("11 nov", 11),
            ("11 dic", 12),
        )
        for phrase, month in month_phrases:
            parsed = extract_datetime(phrase, lang='es',
                                      anchorDate=datetime(1998, month, 1))
            self.assertEqual(
                parsed[0],
                datetime(1998, month, 11, tzinfo=default_timezone()))

        # An empty utterance yields no result at all.
        self.assertEqual(extract_datetime("", lang='es'), None)

    # TODO fix bug causing these tests to fail (MycroftAI/mycroft-core#2348)
    #         fix the translation bug that keeps the functions below from
    #         returning correctly
    #         (written with apologies by an English speaker)
    #      further broken tests are below their respective working tests.
@unittest.skip("currently processing these months incorrectly") def test_bugged_output_wastebasket(self): self.assertEqual(extract_datetime( "11 jun", lang='es', anchorDate=datetime(1998, 6, 1))[0], datetime(1998, 6, 11, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "11 junio", lang='es', anchorDate=datetime(1998, 6, 1))[0], datetime(1998, 6, 11, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "11 jul", lang='es', anchorDate=datetime(1998, 7, 1))[0], datetime(1998, 7, 11, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "11 ago", lang='es', anchorDate=datetime(1998, 8, 1))[0], datetime(1998, 8, 11, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "11 sep", lang='es', anchorDate=datetime(1998, 9, 1))[0], datetime(1998, 9, 11, tzinfo=default_timezone())) # It's also failing on years self.assertEqual(extract_datetime( "11 ago 1998", lang='es')[0], datetime(1998, 8, 11, tzinfo=default_timezone())) def test_extract_datetime_relative(self): self.assertEqual(extract_datetime( "esta noche", anchorDate=datetime(1998, 1, 1), lang='es'), [datetime(1998, 1, 1, 21, 0, 0, tzinfo=default_timezone()), 'esta']) self.assertEqual(extract_datetime( "ayer noche", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 31, 21, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "el noche anteayer", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 30, 21, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "el noche ante ante ayer", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 29, 21, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "mañana por la mañana", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1998, 1, 2, 8, tzinfo=default_timezone())) self.assertEqual(extract_datetime( "ayer por la tarde", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 31, 15, tzinfo=default_timezone())) 
self.assertEqual(extract_datetime("hoy 2 de la mañana", lang='es', anchorDate=datetime(1998, 1, 1))[0], datetime(1998, 1, 1, 2, tzinfo=default_timezone())) self.assertEqual(extract_datetime("hoy 2 de la tarde", lang='es', anchorDate=datetime(1998, 1, 1))[0], datetime(1998, 1, 1, 14, tzinfo=default_timezone())) def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" self.assertEqual(extract_datetime('no hay tiempo', lang='es-es'), None) @unittest.skip("These phrases are not parsing correctly.") def test_extract_datetime_relative_failing(self): # parses as "morning" and returns 8:00 on anchorDate self.assertEqual(extract_datetime( "mañana", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1998, 1, 2)) # unimplemented logic self.assertEqual(extract_datetime( "anoche", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 31, 21)) self.assertEqual(extract_datetime( "anteanoche", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 30, 21)) self.assertEqual(extract_datetime( "hace tres noches", anchorDate=datetime(1998, 1, 1), lang='es')[0], datetime(1997, 12, 29, 21)) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_eu.py000066400000000000000000000251651426211343400222350ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
#
from datetime import datetime
import unittest

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.parse import (normalize, extract_numbers, extract_number,
                                 extract_datetime)
from lingua_franca.lang.parse_eu import is_fractional_eu


def setUpModule():
    """Load Basque and make it the default language for this module."""
    load_language('eu-eu')
    set_default_lang('eu')


def tearDownModule():
    """Unload Basque once the tests of this module are done."""
    unload_language('eu')


class TestNormalize(unittest.TestCase):
    """Basque (Euskara) normalization, number and fraction test cases."""

    # TODO: not sure whether this test is needed
    def test_articles_eu(self):
        """Normalize phrases with article removal switched on.

        Both phrases are expected to come back unchanged.
        """
        for phrase in ("hau da froga", "eta hau beste froga"):
            with self.subTest(phrase=phrase):
                self.assertEqual(
                    normalize(phrase, lang="eu", remove_articles=True),
                    phrase)

    def test_numbers_eu(self):
        """Normalize spelled-out Basque numbers into digits."""
        expectations = (
            ("hau da bat", "hau da 1"),
            ("hau da bi hiru froga", "hau da 2 3 froga"),
            ("hau da lau bost sei froga", "hau da 4 5 6 froga"),
            ("zazpi gehi zortzi gehi bederatzi", "7 gehi 8 gehi 9"),
            ("hamar hamaika hamabi hamahiru hamalau hamabost",
             "10 11 12 13 14 15"),
            ("hamasei hamazazpi", "16 17"),
            ("hemezortzi hemeretzi", "18 19"),
            ("hogei hogeita hamar berrogeita bat", "20 30 41"),
            ("hogeita hamabi zaldi", "32 zaldi"),
            ("ehun zaldi", "100 zaldi"),
            ("ehun eta hamaika zaldi", "111 zaldi"),
            ("laurehun eta bat behi zeuden", "401 behi zeuden"),
            ("bi mila", "2000"),
            ("bi mila hirurehun eta berrogeita bost", "2345"),
            ("ehun eta hogeita hiru mila laurehun eta berrogeita hamasei",
             "123456"),
            ("bostehun eta hogeita bost mila", "525000"),
            ("bederatzirehun eta laurogeita hemeretzi mila "
             "bederatzirehun eta laurogeita hemeretzi",
             "999999"),
        )
        for phrase, normalized in expectations:
            with self.subTest(phrase=phrase):
                self.assertEqual(normalize(phrase, lang="eu"), normalized)

    def test_extract_number_eu(self):
        """Extract digits, number words, decimals and fractions."""
        # The same six numbers are expected with or without unrelated words
        # mixed into the phrase.
        mixed_phrases = (
            "1 7 lau hamalau zortzi 157",
            "1 7 lau albuquerque laranja John Doe hamalau zortzi 157",
        )
        for phrase in mixed_phrases:
            with self.subTest(phrase=phrase):
                self.assertEqual(
                    sorted(extract_numbers(phrase, lang='eu')),
                    [1, 4, 7, 8, 14, 157])

        # "puntu" and "koma" are both expected to act as decimal separators,
        # regardless of capitalisation.
        for phrase in ("sei puntu bi", "sei puntu Bi", "sei koma bi"):
            with self.subTest(phrase=phrase):
                self.assertEqual(extract_number(phrase, lang='eu'), 6.2)

        self.assertEqual(extract_numbers("erdi bat", lang='eu'), [0.5])

        single_values = (
            ("laurdena", 0.25),
            ("2.0", 2.0),
            ("1/4", 0.25),
            ("bi eta erdi", 2.5),
            ("hamalau eta milarena", 14.001),
            ("bi puntu zero bi", 2.02),
        )
        for phrase, value in single_values:
            with self.subTest(phrase=phrase):
                self.assertEqual(extract_number(phrase, lang='eu'), value)

    def test_isFraction_eu(self):
        """Map fraction words onto the fraction they denote."""
        denominators = (
            ("hogeirena", 20),
            ("hogeita hamarrena", 30),
            ("ehunena", 100),
            ("milarena", 1000),
        )
        for word, denominator in denominators:
            with self.subTest(word=word):
                self.assertEqual(is_fractional_eu(word), 1.0 / denominator)

    @unittest.skip("unwritten logic")
    def test_comma_fraction_logic_eu(self):
        """Parse a comma-separated decimal (skipped: not implemented)."""
        # Logic has not been written to parse "#,#" as "#.#"
        # English-style decimal numbers work because they just get
        # float(str)ed
        self.assertEqual(extract_number("2,0", lang='eu'), 2.0)


class TestDatetime_eu(unittest.TestCase):
    """Basque date and time extraction test cases."""

    def test_datetime_by_date_eu(self):
        """Parse "<day> <abbreviated month>" phrases into dates.

        Every phrase is anchored on the first day of the named month in
        1998 and compared against a naive ``datetime``.
        """
        # NOTE: the original computed ``_now`` / ``relative_year`` here
        # without ever using them; that dead code has been dropped.

        # June through September are left out here because there is an
        # issue with those months (see the skipped test below).
        month_phrases = (
            ("11 urt", 1),
            ("11 ots", 2),
            ("11 mar", 3),
            ("11 api", 4),
            ("11 mai", 5),
            ("11 urr", 10),
            ("11 aza", 11),
            ("11 abe", 12),
        )
        for phrase, month in month_phrases:
            with self.subTest(phrase=phrase):
                parsed = extract_datetime(
                    phrase, lang='eu', anchorDate=datetime(1998, month, 1))
                self.assertEqual(parsed[0], datetime(1998, month, 11))

        # An empty utterance yields no result at all.
        self.assertEqual(extract_datetime("", lang='eu'), None)

    # TODO fix bug causing these tests to fail (MycroftAI/mycroft-core#2348)
    #         fix the translation bug that keeps the functions below from
    #         returning correctly
    #         (written with apologies by an English speaker)
    #      further broken tests are below their respective working tests.
@unittest.skip("currently processing these months incorrectly") def test_bugged_output_wastebasket(self): self.assertEqual(extract_datetime( "11 eka", lang='eu', anchorDate=datetime(1998, 6, 1))[0], datetime(1998, 6, 11)) self.assertEqual(extract_datetime( "11 ekaina", lang='eu', anchorDate=datetime(1998, 6, 1))[0], datetime(1998, 6, 11)) self.assertEqual(extract_datetime( "11 uztaila", lang='eu', anchorDate=datetime(1998, 7, 1))[0], datetime(1998, 7, 11)) self.assertEqual(extract_datetime( "11 abu", lang='eu', anchorDate=datetime(1998, 8, 1))[0], datetime(1998, 8, 11)) self.assertEqual(extract_datetime( "11 ira", lang='eu', anchorDate=datetime(1998, 9, 1))[0], datetime(1998, 9, 11)) # It's also failing on years self.assertEqual(extract_datetime( "11 abu 1998", lang='eu')[0], datetime(1998, 8, 11)) def test_extract_datetime_relative(self): self.assertEqual(extract_datetime( "gaurko gaua", anchorDate=datetime(1998, 1, 1), lang='eu'), [datetime(1998, 1, 1, 21, 0, 0), '']) self.assertEqual(extract_datetime( "gau honetan", anchorDate=datetime(1998, 1, 1), lang='eu'), [datetime(1998, 1, 1, 21, 0, 0), 'honetan']) self.assertEqual(extract_datetime( "atzoko gaua", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1997, 12, 31, 21)) self.assertEqual(extract_datetime( "herenegungo gaua", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1997, 12, 30, 21)) self.assertEqual(extract_datetime( "duela 3 eguneko gaua", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1997, 12, 29, 21)) self.assertEqual(extract_datetime( "biharko goiza", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1998, 1, 2, 8)) self.assertEqual(extract_datetime( "atzoko arratsaldea", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1997, 12, 31, 15)) self.assertEqual(extract_datetime( "duela 2 egun", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1997, 12, 30)) self.assertEqual(extract_datetime("gaurko goizeko 2", lang='eu', anchorDate=datetime(1998, 1, 1))[0], 
datetime(1998, 1, 1, 2)) self.assertEqual(extract_datetime("gaurko arratsaldeko 2", lang='eu', anchorDate=datetime(1998, 1, 1))[0], datetime(1998, 1, 1, 14)) self.assertEqual(extract_datetime( "datorren urtea", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1999, 1, 1)) def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" self.assertEqual(extract_datetime('ez dago denborarik', lang='eu-eu'), None) @unittest.skip("These phrases are not parsing correctly.") def test_extract_datetime_relative_failing(self): self.assertEqual(extract_datetime( "bart", anchorDate=datetime(1998, 1, 1), lang='eu')[0], datetime(1997, 12, 31, 21)) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_fa.py000066400000000000000000000226731426211343400222130ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class TestNormalize(unittest.TestCase):
    """Parsing tests for Persian, exercised through the default language."""

    def test_extract_number(self):
        """extract_number turns spelled-out Persian cardinals into numbers."""
        cases = (
            ("این تست دو است", 2),
            ("این تست شماره چهار است", 4),
            ("سه فنجان", 3),
            ("یک و نیم فنجان", 1.5),
            ("بیست و دو", 22),
            ("دویست", 200),
            ("نه هزار", 9000),
            ("هزار و پانصد", 1500),
            ("ششصد و شصت و شش", 666),
            ("دو میلیون", 2000000),
            ("دو هزار و هفده", 2017),
            ("شانزده هزار و صد و پونزده", 16115),
            ("هجده میلیون و هجده هزار و دویست و هجده", 18018218),
            ("دو میلیون و پانصد هزار " "تن گوشت یخ زده", 2500000),
        )
        for phrase, number in cases:
            self.assertEqual(extract_number(phrase), number)

        # Disabled cases, kept for reference (presumably not yet supported
        # by the Persian parser -- confirm before re-enabling).
        #
        # With ordinals=True:
        #   ("این تست اول است", 1)
        #   ("این تست دوم است", 2)
        #   ("این تست سوم است", 3.0)
        #   ("چهارمی", 4.0)
        #   ("سی و ششمی", 36.0)
        #
        # Fractions:
        #   ("یک سوم فنجان", 1.0 / 3.0)
        #   ("۱/۳ فنجان", 1.0 / 3.0)
        #   ("یک چهارم فنجان", 0.25)
        #   ("۱/۴ فنجان", 0.25)
        #   ("دو سوم فنجان", 2.0 / 3.0)
        #   ("سه چهارم فنجان", 3.0 / 4.0)
        #   ("یک و سه چهارم فنجان", 1.75)
        #   ("۱ فنجان و نیم", 1.5)
        #   ("یک فنجان و نیم", 1.5)
        #   ("بیست و دو و سه پنجم", 22.6)

    def test_extract_duration_en(self):
        """extract_duration returns a (timedelta, leftover text) pair."""
        cases = (
            ("10 ثانیه", timedelta(seconds=10.0), ""),
            ("5 دقیقه", timedelta(minutes=5), ""),
            ("2 ساعت", timedelta(hours=2), ""),
            ("3 روز", timedelta(days=3), ""),
            ("25 هفته", timedelta(weeks=25), ""),
            ("هفت ساعت", timedelta(hours=7), ""),
            ("7.5 ثانیه", timedelta(seconds=7.5), ""),
            ("هشت و نیم روز و " "سی و نه ثانیه",
             timedelta(days=8.5, seconds=39), ""),
            ("یک تایمر برای نیم ساعت دیگه بزار",
             timedelta(minutes=30), "یک تایمر برای دیگه بزار"),
            ("چهار و نیم دقیقه تا " "طلوع آفتاب",
             timedelta(minutes=4.5), "تا طلوع آفتاب"),
            ("این فیلم یک ساعت و پنجاه و هفت و نیم دقیقه " "طول می کشد",
             timedelta(hours=1, minutes=57.5), "این فیلم طول می کشد"),
        )
        for phrase, duration, remainder in cases:
            self.assertEqual(extract_duration(phrase), (duration, remainder))

    def test_extractdatetime_en(self):
        """extract_datetime resolves phrases relative to a fixed anchor."""
        anchor = datetime(2017, 6, 27, 13, 4)  # Tue June 27, 2017 @ 1:04pm

        cases = (
            ("الان ساعت اینه", "2017-06-27 13:04:00", "ساعت اینه"),
            ("یک ثانیه دیگه", "2017-06-27 13:04:01", ""),
            ("یک دقیقه دیگه", "2017-06-27 13:05:00", ""),
            ("دو دقیقه دیگه", "2017-06-27 13:06:00", ""),
            ("دو ساعت دیگه", "2017-06-27 15:04:00", ""),
            ("من یک ساعت دیگه می خوامش",
             "2017-06-27 14:04:00", "من می خوامش"),
            ("1 ثانیه دیگه", "2017-06-27 13:04:01", ""),
            ("2 ثانیه دیگه", "2017-06-27 13:04:02", ""),
            ("یک آلارم برای یک دقیقه بعد بزار",
             "2017-06-27 13:05:00", "یک آلارم برای بزار"),
            ("یک آلارم برای نیم ساعت دیگه بزار",
             "2017-06-27 13:34:00", "یک آلارم برای بزار"),
            ("یه آلارم برای پنج روز بعد بزار",
             "2017-07-02 00:00:00", "یه آلارم برای بزار"),
            ("پس فردا", "2017-06-29 00:00:00", ""),
            ("آب و هوا پس فردا چطوره؟",
             "2017-06-29 00:00:00", "آب و هوا چطوره؟"),
            ("هوای جمعه صبح چطوره؟",
             "2017-06-30 08:00:00", "هوای چطوره؟"),
            ("هوای فردا چطوره؟",
             "2017-06-28 00:00:00", "هوای چطوره؟"),
            ("هوای امروز بعد از ظهر چطوره؟",
             "2017-06-27 15:00:00", "هوای چطوره؟"),
            ("یادم بنداز که هشت هفته و دو روز دیگه به مادرم زنگ بزنم",
             "2017-08-24 00:00:00", "یادم بنداز که به مادرم زنگ بزنم"),
        )
        for phrase, expected_stamp, expected_rest in cases:
            moment, rest = extract_datetime(phrase, anchor)
            stamp = moment.strftime("%Y-%m-%d %H:%M:%S")
            self.assertEqual(stamp, expected_stamp, "for=" + phrase)
            self.assertEqual(rest, expected_rest, "for=" + phrase)

        # Disabled cases, kept for reference:
        #   ("ساعت بیست و دو و چهل و پنج دقیقه بهم یادآوری کن",
        #    "2017-06-27 22:45:00", "بهم یادآوری کن")
        #   ("یادم بنداز که دوازده مرداد به مادرم زنگ بزنم",
        #    "2017-08-03 00:00:00", "یادم بنداز که به مادرم زنگ بزنم")
        #   ("یادم بنداز که ساعت هفت به مادرم زنگ بزنم",
        #    "2017-06-28 07:00:00", "یادم بنداز که به مادرم زنگ بزنم")
        #   ("یادم بنداز که فردا ساعت بیست و دو به مادرم زنگ بزنم",
        #    "2017-06-28 22:00:00", "یادم بنداز که به مادرم زنگ بزنم")
        # TODO: This test is imperfect due to the "at 7:00" still in the
        #       remainder.  But let it pass for now since time is correct

    def test_multiple_numbers(self):
        """extract_numbers returns every number in the phrase, in order."""
        cases = (
            ("یک دو سه", [1.0, 2.0, 3.0]),
            ("ده بیست سه پونزده هزار و شصت و شونزده",
             [10, 20, 3, 15060, 16]),
        )
        for phrase, numbers in cases:
            self.assertEqual(extract_numbers(phrase), numbers)
remove_articles=False, lang="fr-fr"), "la dernière tentative") def test_extractnumber_fr(self): self.assertEqual(extract_number("voici le premier test", lang="fr-fr"), 1) self.assertEqual(extract_number("c'est 2 tests", lang="fr-fr"), 2) self.assertEqual(extract_number("voici le second test", lang="fr-fr"), 2) self.assertEqual(extract_number("voici trois tests", lang="fr-fr"), 3) self.assertEqual(extract_number("voici le test numéro 4", lang="fr-fr"), 4) self.assertEqual(extract_number("un tiers de litre", lang="fr-fr"), 1.0 / 3.0) self.assertEqual(extract_number("3 cuillères", lang="fr-fr"), 3) self.assertEqual(extract_number("1/3 de litre", lang="fr-fr"), 1.0 / 3.0) self.assertEqual(extract_number("un quart de bol", lang="fr-fr"), 0.25) self.assertEqual(extract_number("1/4 de verre", lang="fr-fr"), 0.25) self.assertEqual(extract_number("2/3 de bol", lang="fr-fr"), 2.0 / 3.0) self.assertEqual(extract_number("3/4 de bol", lang="fr-fr"), 3.0 / 4.0) self.assertEqual(extract_number("1 et 3/4 de bol", lang="fr-fr"), 1.75) self.assertEqual(extract_number("1 bol et demi", lang="fr-fr"), 1.5) self.assertEqual(extract_number("un bol et demi", lang="fr-fr"), 1.5) self.assertEqual(extract_number("un et demi bols", lang="fr-fr"), 1.5) self.assertEqual(extract_number("un bol et un demi", lang="fr-fr"), 1.5) self.assertEqual(extract_number("trois quarts de bol", lang="fr-fr"), 3.0 / 4.0) self.assertEqual(extract_number("32.2 degrés", lang="fr-fr"), 32.2) self.assertEqual(extract_number("2 virgule 2 cm", lang="fr-fr"), 2.2) self.assertEqual(extract_number("2 virgule 0 2 cm", lang="fr-fr"), 2.02) self.assertEqual(extract_number("ça fait virgule 2 cm", lang="fr-fr"), 0.2) self.assertEqual(extract_number("point du tout", lang="fr-fr"), False) self.assertEqual(extract_number("32.00 secondes", lang="fr-fr"), 32) self.assertEqual(extract_number("mange trente-et-une bougies", lang="fr-fr"), 31) self.assertEqual(extract_number("un trentième", lang="fr-fr"), 1.0 / 30.0) 
def test_extractdatetime_fr(self):
    """Check French date/time extraction against fixed anchor datetimes.

    Every case is ``(utterance, expected timestamp, expected leftover)``,
    where the timestamp is the extracted datetime formatted as
    ``%Y-%m-%d %H:%M:%S``.  Each assertion carries a ``for=<utterance>``
    message so a failure identifies the offending input.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    anchor_midnight = datetime(2017, 6, 27, 0, 0, tzinfo=default_timezone())
    anchor_evening = datetime(2017, 6, 30, 17, 0, tzinfo=default_timezone())

    def check(anchor, text, expected_date, expected_leftover):
        # Unpacking fails loudly (TypeError) if nothing was extracted.
        extracted, leftover = extract_datetime(text, anchor, lang="fr-fr")
        self.assertEqual(extracted.strftime(timestamp_format),
                         expected_date, "for=" + text)
        self.assertEqual(leftover, expected_leftover, "for=" + text)

    # Cases resolved relative to 2017-06-27 00:00.
    midnight_cases = (
        ("Planifier l'embûche dans 5 jours",
         "2017-07-02 00:00:00", "planifier embûche"),
        ("Quel temps fera-t-il après-demain ?",
         "2017-06-29 00:00:00", "quel temps fera-t-il"),
        ("Met un rappel à 10:45 du soir",
         "2017-06-27 22:45:00", "met 1 rappel"),
        ("quel temps est prévu pour vendredi matin ?",
         "2017-06-30 08:00:00", "quel temps est prévu pour"),
        ("quel temps fait-il demain",
         "2017-06-28 00:00:00", "quel temps fait-il"),
        ("rappelle-moi d'appeler maman dans 8 semaines et"
         " 2 jours",
         "2017-08-24 00:00:00", "rappelle-moi appeler maman"),
        ("Jouer des musiques de Beyonce 2 jours après vendredi",
         "2017-07-02 00:00:00", "jouer musiques beyonce"),
        ("Commencer l'invasion à 15 heures 45 jeudi",
         "2017-06-29 15:45:00", "commencer invasion"),
        ("Lundi, commander le gâteau à la boulangerie",
         "2017-07-03 00:00:00", "commander gâteau à boulangerie"),
        ("Jouer la chanson Joyeux anniversaire dans 5 ans",
         "2022-06-27 00:00:00", "jouer chanson joyeux"
         " anniversaire"),
        ("Skyper Maman à 12 heures 45 jeudi prochain",
         "2017-07-06 12:45:00", "skyper maman"),
        ("Quel temps fera-t-il jeudi prochain ?",
         "2017-07-06 00:00:00", "quel temps fera-t-il"),
        ("Quel temps fera-t-il vendredi matin ?",
         "2017-06-30 08:00:00", "quel temps fera-t-il"),
        ("Quel temps fera-t-il vendredi soir",
         "2017-06-30 19:00:00", "quel temps fera-t-il"),
        ("Quel temps fera-t-il vendredi après-midi",
         "2017-06-30 15:00:00", "quel temps fera-t-il"),
        ("rappelle-moi d'appeler maman le 3 août",
         "2017-08-03 00:00:00", "rappelle-moi appeler maman"),
        ("Acheter des feux d'artifice pour le 14 juil",
         "2017-07-14 00:00:00", "acheter feux artifice pour"),
        ("Quel temps fera-t-il 2 semaines après vendredi",
         "2017-07-14 00:00:00", "quel temps fera-t-il"),
        ("Quel temps fera-t-il mercredi à 7 heures",
         "2017-06-28 07:00:00", "quel temps fera-t-il"),
        # Listed twice in the suite; kept as-is.
        ("Quel temps fera-t-il mercredi à 7 heures",
         "2017-06-28 07:00:00", "quel temps fera-t-il"),
        ("Prendre rendez-vous à 12:45 jeudi prochain",
         "2017-07-06 12:45:00", "prendre rendez-vous"),
        ("Quel temps fait-il ce jeudi ?",
         "2017-06-29 00:00:00", "quel temps fait-il"),
        ("Organiser une visite 2 semaines et 6 jours après"
         " samedi",
         "2017-07-21 00:00:00", "organiser 1 visite"),
        ("Commencer l'invasion à 3 heures 45 jeudi",
         "2017-06-29 03:45:00", "commencer invasion"),
        ("Commencer l'invasion à 20 heures jeudi",
         "2017-06-29 20:00:00", "commencer invasion"),
        ("Lancer la fête jeudi à 8 heures du soir",
         "2017-06-29 20:00:00", "lancer fête"),
        ("Commencer l'invasion à 4 heures de l'après-midi jeudi",
         "2017-06-29 16:00:00", "commencer invasion"),
        ("Commencer l'invasion jeudi à midi",
         "2017-06-29 12:00:00", "commencer invasion"),
        ("Commencer l'invasion jeudi à minuit",
         "2017-06-29 00:00:00", "commencer invasion"),
        ("Commencer l'invasion jeudi à dix-sept heures",
         "2017-06-29 17:00:00", "commencer invasion"),
        ("rappelle-moi de me réveiller dans 4 années",
         "2021-06-27 00:00:00", "rappelle-moi me réveiller"),
        ("rappelle-moi de me réveiller dans 4 ans et 4 jours",
         "2021-07-01 00:00:00", "rappelle-moi me réveiller"),
        ("Quel temps fera-t-il 3 jours après demain ?",
         "2017-07-01 00:00:00", "quel temps fera-t-il"),
        ("3 décembre",
         "2017-12-03 00:00:00", ""),
        ("retrouvons-nous à 8:00 ce soir",
         "2017-06-27 20:00:00", "retrouvons-nous"),
        ("retrouvons-nous demain à minuit et demi",
         "2017-06-28 00:30:00", "retrouvons-nous"),
        ("retrouvons-nous à midi et quart",
         "2017-06-27 12:15:00", "retrouvons-nous"),
        ("retrouvons-nous à midi moins le quart",
         "2017-06-27 11:45:00", "retrouvons-nous"),
        ("retrouvons-nous à midi moins dix",
         "2017-06-27 11:50:00", "retrouvons-nous"),
        ("retrouvons-nous à midi dix",
         "2017-06-27 12:10:00", "retrouvons-nous"),
        ("retrouvons-nous à minuit moins 23",
         "2017-06-27 23:37:00", "retrouvons-nous"),
        ("mangeons à 3 heures moins 23 minutes",
         "2017-06-27 02:37:00", "mangeons"),
        ("mangeons aussi à 4 heures moins le quart du matin",
         "2017-06-27 03:45:00", "mangeons aussi"),
        ("mangeons encore à minuit moins le quart",
         "2017-06-27 23:45:00", "mangeons encore"),
        ("buvons à 4 heures et quart",
         "2017-06-27 04:15:00", "buvons"),
        ("buvons également à 18 heures et demi",
         "2017-06-27 18:30:00", "buvons également"),
        ("dormons à 20 heures moins le quart",
         "2017-06-27 19:45:00", "dormons"),
        ("buvons le dernier verre à 10 heures moins 12 du soir",
         "2017-06-27 21:48:00", "buvons dernier verre"),
        ("s'échapper de l'île à 15h45",
         "2017-06-27 15:45:00", "s'échapper île"),
        ("s'échapper de l'île à 3h45min de l'après-midi",
         "2017-06-27 15:45:00", "s'échapper île"),
        ("décale donc ça à 3h48min cet après-midi",
         "2017-06-27 15:48:00", "décale donc ça"),
        ("construire un bunker à 9h42min du matin",
         "2017-06-27 09:42:00", "construire 1 bunker"),
        ("ou plutôt à 9h43 ce matin",
         "2017-06-27 09:43:00", "ou plutôt"),
        ("faire un feu à 8h du soir",
         "2017-06-27 20:00:00", "faire 1 feu"),
        ("faire la fête jusqu'à 18h cette nuit",
         "2017-06-27 18:00:00", "faire fête jusqu'à"),
        ("cuver jusqu'à 4h cette nuit",
         "2017-06-27 04:00:00", "cuver jusqu'à"),
        ("réveille-moi dans 20 secondes aujourd'hui",
         "2017-06-27 00:00:20", "réveille-moi"),
        ("réveille-moi dans 33 minutes",
         "2017-06-27 00:33:00", "réveille-moi"),
        ("tais-toi dans 12 heures et 3 minutes",
         "2017-06-27 12:03:00", "tais-toi"),
        ("ouvre-la dans 1 heure 3",
         "2017-06-27 01:03:00", "ouvre-la"),
        ("ferme-la dans 1 heure et quart",
         "2017-06-27 01:15:00", "ferme-la"),
        ("scelle-la dans 1 heure et demi",
         "2017-06-27 01:30:00", "scelle-la"),
        ("zippe-la dans 2 heures moins 12",
         "2017-06-27 01:48:00", "zippe-la"),
        ("soude-la dans 3 heures moins le quart",
         "2017-06-27 02:45:00", "soude-la"),
        ("mange la semaine prochaine",
         "2017-07-04 00:00:00", "mange"),
        ("bois la semaine dernière",
         "2017-06-20 00:00:00", "bois"),
        ("mange le mois prochain",
         "2017-07-27 00:00:00", "mange"),
        ("bois le mois dernier",
         "2017-05-27 00:00:00", "bois"),
        ("mange l'an prochain",
         "2018-06-27 00:00:00", "mange"),
        ("bois l'année dernière",
         "2016-06-27 00:00:00", "bois"),
        ("reviens à lundi dernier",
         "2017-06-26 00:00:00", "reviens"),
        ("capitule le 8 mai 1945",
         "1945-05-08 00:00:00", "capitule"),
        ("rédige le contrat 3 jours après jeudi prochain",
         "2017-07-09 00:00:00", "rédige contrat"),
        ("signe le contrat 2 semaines après jeudi dernier",
         "2017-07-06 00:00:00", "signe contrat"),
        ("lance le four dans un quart d'heure",
         "2017-06-27 00:15:00", "lance four"),
        ("enfourne la pizza dans une demi-heure",
         "2017-06-27 00:30:00", "enfourne pizza"),
        ("arrête le four dans trois quarts d'heure",
         "2017-06-27 00:45:00", "arrête four"),
        ("mange la pizza dans une heure",
         "2017-06-27 01:00:00", "mange pizza"),
        ("bois la bière dans 2h23",
         "2017-06-27 02:23:00", "bois bière"),
        ("faire les plantations le 3ème jour de mars",
         "2018-03-03 00:00:00", "faire plantations"),
        ("récolter dans 10 mois",
         "2018-04-27 00:00:00", "récolter"),
        ("point 6a: dans 10 mois",
         "2018-04-27 06:00:00", "point"),
        ("l'après-midi démissionner à 4:59",
         "2017-06-27 16:59:00", "démissionner"),
        ("cette nuit dormir",
         "2017-06-27 02:00:00", "dormir"),
        ("ranger son bureau à 1700 heures",
         "2017-06-27 17:00:00", "ranger son bureau"),
    )
    for case in midnight_cases:
        check(anchor_midnight, *case)

    # Cases resolved relative to 2017-06-30 17:00.
    evening_cases = (
        ("range le contrat 2 semaines après lundi",
         "2017-07-17 00:00:00", "range contrat"),
        ("achète-toi de l'humour à 15h",
         "2017-07-01 15:00:00", "achète-toi humour"),
    )
    for case in evening_cases:
        check(anchor_evening, *case)

    # Disabled until French extract_datetime incorporates the fixes for
    # UTC / local timezones.  Without an explicit anchor the expected date
    # comes from datetime.now(), so the case fails whenever the date in
    # the local timezone differs from the date in UTC at run time.
    #
    # extracted, leftover = extract_datetime("tais-toi aujourd'hui",
    #                                        lang="fr-fr")
    # self.assertEqual(extracted.strftime(timestamp_format),
    #                  datetime.now().strftime("%Y-%m-%d") + " 00:00:00")
    # self.assertEqual(leftover, "tais-toi")

    # Inputs with no usable date or time are expected to yield None.
    for text in ("", "phrase inutile", "apprendre à compter à 37 heures"):
        self.assertIsNone(extract_datetime(text, lang="fr-fr"),
                          "for=" + text)
def test_numbers_fr(self):
    """Check that normalize() rewrites French number words as digits.

    Each case is ``(phrase, expected normalized text)``.  The assertion
    message names the input phrase, which the default assertEqual output
    (result vs. expected) does not show.
    """
    cases = (
        ("c'est un deux trois test", "c'est 1 2 3 test"),
        (" c'est le quatre cinq six test", "c'est 4 5 6 test"),
        ("c'est le sept huit neuf test", "c'est 7 8 9 test"),
        # Listed twice in the suite; kept as-is.
        ("c'est le sept huit neuf test", "c'est 7 8 9 test"),
        ("voilà le test dix onze douze", "voilà test 10 11 12"),
        ("voilà le treize quatorze test", "voilà 13 14 test"),
        ("ça fait quinze seize dix-sept", "ça fait 15 16 17"),
        ("ça fait dix-huit dix-neuf vingt", "ça fait 18 19 20"),
        ("ça fait mille cinq cents", "ça fait 1500"),
        ("voilà cinq cents trente et un mille euros",
         "voilà 531000 euros"),
        ("voilà trois cents soixante mille cinq"
         " cents quatre-vingt-dix-huit euros",
         "voilà 360598 euros"),
        ("voilà vingt et un euros", "voilà 21 euros"),
        ("joli zéro sur vingt", "joli 0 sur 20"),
        ("je veux du quatre-quart", "je veux quatre-quart"),
        ("pour la neuf centième fois", "pour 900e fois"),
        ("pour la première fois", "pour 1er fois"),
        ("le neuf cents quatre-vingt-dix"
         " millième épisode",
         "990000e épisode"),
        ("la septième clé", "7e clé"),
        ("la neuvième porte", "9e porte"),
        ("le cinquième jour", "5e jour"),
        ("le trois-cents-soixante-cinquième jour", "365e jour"),
        ("la 1ère fois", "1er fois"),
        ("le centième centime", "100e centime"),
        ("le millième millésime", "1000e millésime"),
        ("le trentième anniversaire", "30e anniversaire"),
    )
    for phrase, expected in cases:
        self.assertEqual(normalize(phrase, lang="fr-fr"), expected,
                         "for=" + phrase)
def test_articles_it(self):
    """
    Italian article handling in normalize(), with remove_articles on and off
    """
    # (sentence, remove_articles flag, expected normalized text)
    cases = (
        ('questo è il test', True, 'questo è test'),
        ('questa è la frase', True, 'questa è frase'),
        ('questo è lo scopo', True, 'questo è scopo'),
        ('questo è il test extra', False, 'questo è il test extra'),
    )
    for sentence, strip_articles, expected in cases:
        normalized = normalize(sentence, lang='it',
                               remove_articles=strip_articles)
        self.assertEqual(normalized, expected)
trentaseiesimo test', lang='it'), 36.0) self.assertEqual(extract_number('questo è il test numero 4', lang='it'), 4) self.assertEqual(extract_number('una tazza', lang='it'), 1) self.assertEqual(extract_number('un gatto', lang='it'), 1) self.assertEqual(extract_number('un terzo di tazza', lang='it'), 1.0 / 3.0) self.assertEqual(extract_number('2 quinti di tazza', lang='it'), 0.4) self.assertEqual(extract_number('due quinti di tazza', lang='it'), 0.4) self.assertEqual(extract_number('tre tazze', lang='it'), 3) self.assertEqual(extract_number('1/3 tazze', lang='it'), 1.0 / 3.0) self.assertEqual(extract_number('un quarto di tazza', lang='it'), 0.25) self.assertEqual(extract_number('1/4 tazza', lang='it'), 0.25) self.assertEqual(extract_number('2/3 tazza', lang='it'), 2.0 / 3.0) self.assertEqual(extract_number('3/4 tazza', lang='it'), 3.0 / 4.0) self.assertEqual(extract_number('1 e 1/4 tazza', lang='it'), 1.25) self.assertEqual(extract_number('1 tazza e mezzo', lang='it'), 1.5) self.assertEqual(extract_number('una tazza e mezzo', lang='it'), 1.5) self.assertEqual(extract_number('una e mezza tazza', lang='it'), 1.5) self.assertEqual(extract_number('una e una mezza tazza', lang='it'), 1.5) self.assertEqual(extract_number('tre quarti di tazza', lang='it'), 3.0 / 4.0) self.assertEqual(extract_number('ventidue', lang='it'), 22) self.assertEqual(extract_number('duecento', lang='it'), 200) self.assertEqual(extract_number('novemila', lang='it'), 9000) self.assertEqual(extract_number('duemilioni', lang='it', short_scale=False), 2000000) self.assertEqual(extract_number('duemilionicinquecentomila ' 'tonnellate di metallo', lang='it'), 2500000) self.assertEqual(extract_number('duemilioni cinquecentomila ' 'tonnellate di metallo', lang='it'), 2500000) self.assertEqual(extract_number('sei trilioni', lang='it'), 6000000000000000000.0) self.assertEqual(extract_number('sei trilioni', short_scale=True, lang='it'), 6e+18) self.assertEqual(extract_number('unmiliardounmilione', lang='it', 
short_scale=False), 1001000000) self.assertEqual(extract_number('unmiliardocento', lang='it', short_scale=False), 1000000100) self.assertEqual(extract_number('duemiliardiunmilionecentotrentadue', lang='it'), 2001000132) self.assertEqual(extract_number('venti diciassettesimi', lang='it'), 20.0/17.0) self.assertEqual(extract_number('uno punto cinque', lang='it'), 1.5) self.assertEqual(extract_number('tre punto quattordici', lang='it'), 3.14) self.assertEqual(extract_number('zero punto due', lang='it'), 0.2) self.assertEqual(extract_number('vecchio miliardi di anni', lang='it'), 1000000000.0) self.assertEqual(extract_number('vecchio trilioni di anni', short_scale=False, lang='it'), 1000000000000000000.0) self.assertEqual(extract_number('centomila', lang='it'), 100000) self.assertEqual(extract_number('millequattrocentonovantadue', lang='it'), 1492) self.assertEqual(extract_number('meno 2', lang='it'), -2) self.assertEqual(extract_number('meno settanta', lang='it'), -70) self.assertEqual(extract_number('mille milioni', lang='it'), 1000000000) self.assertEqual(extract_number('millecentouno', lang='it'), 1101) self.assertEqual(extract_number('un sesto terzo', lang='it'), 1 / 6 / 3) self.assertEqual(extract_number('trenta secondi', lang='it'), 30) self.assertEqual(extract_number('trenta secondi', lang='it', ordinals=True), 30) self.assertEqual(extract_number('sette e qualcosa', lang='it'), 7.0) self.assertEqual(extract_number('sette punto 5', lang='it'), 7.5) self.assertEqual(extract_number('sette punto 575', lang='it'), 7.575) self.assertEqual(extract_number('sette e mezzo', lang='it'), 7.5) self.assertEqual(extract_number('sette e ottanta', lang='it'), 7.80) self.assertEqual(extract_number('sette e otto', lang='it'), 7.8) self.assertEqual(extract_number('sette e zero otto', lang='it'), 7.08) self.assertEqual(extract_number('sette e zero zero zero otto gradi', lang='it'), 7.0008) self.assertEqual(extract_number('venti tredicesimi', lang='it'), 20.0 / 13.0) 
self.assertEqual(extract_number('venti tredicesimi', lang='it', short_scale=True), 20.0 / 13.0) self.assertEqual(extract_number('sei virgola sessantasei', lang='it'), 6.66) self.assertEqual(extract_number('sei virgola sessantasei', lang='it'), 6.66) self.assertEqual(extract_number('seicentosessantasei', lang='it'), 666) self.assertEqual(extract_number('seicento sessantasei', lang='it'), 666) self.assertEqual(extract_number('mille quattrocento novantadue', lang='it'), 1492) self.assertEqual(extract_number('millequattrocentonovantadue', lang='it'), 1492) self.assertEqual(extract_number('sei cento punto zero sei', lang='it'), 600.06) self.assertEqual(extract_number('seicento punto zero zero sei', lang='it'), 600.006) self.assertEqual(extract_number('seicento punto zero zero zero sei', lang='it'), 600.0006) self.assertEqual(extract_number('tre decimi ', lang='it'), 0.30000000000000004) self.assertEqual(extract_number('dodici centesimi', lang='it'), 0.12) # self.assertEqual(extract_number('cinque e quarantadue millesimi', # lang='it'), 5.042) self.assertEqual(extract_number('milleuno', lang='it'), 1001) self.assertEqual(extract_number('due mila ventidue dollari ', lang='it'), 2022) self.assertEqual(extract_number( 'centoquattordicimilaquattrocentoundici dollari ', lang='it', ordinals=True, short_scale=True), 114411) self.assertEqual(extract_number('ventitre dollari ', lang='it'), 23) self.assertEqual(extract_number('ventuno anni ', lang='it'), 21) self.assertEqual(extract_number('dodici e quarantacinque ', lang='it'), 12.45) self.assertEqual(extract_number('avvisa se qualcuno arriva ', lang='it'), False) self.assertTrue(extract_number('Il giocatore di tennis è veloce', lang='it') is False) self.assertTrue(extract_number('nessuno', lang='it') is False) self.assertTrue(extract_number('fraggle zero', lang='it') is not False) self.assertEqual(extract_number('fraggle zero', lang='it'), 0) self.assertTrue(extract_number('grobo 0', lang='it') is not False) 
self.assertEqual(extract_number('grobo 0', lang='it'), 0) self.assertEqual(extract_number('un paio di birre', lang='it'), 2) self.assertEqual(extract_number('un centinaio di birre', lang='it'), 100) self.assertEqual(extract_number('un paio di migliaia di birre', lang='it'), 2000) self.assertEqual(extract_number('una decina di monete', lang='it'), 10) self.assertEqual(extract_number('tre dozzine di uova', lang='it'), 36) self.assertEqual(extract_number('zero gatti', lang='it'), 0) self.assertEqual(extract_number('Zero gatti', lang='it'), 0) def test_extractdatetime_it_not_normalized(self): """ Test cases for Italian datetime parsing """ def extractWithFormat_it(text): date = datetime(2018, 1, 13, 13, 4, tzinfo=default_timezone()) # Sab 13 Gen, 2018 @ 13:04 [extractedDate, leftover] = extract_datetime(text, date, lang='it-it') extractedDate = extractedDate.strftime('%Y-%m-%d %H:%M:%S') return [extractedDate, leftover] def testExtract_it(text, expected_date, expected_leftover): res = extractWithFormat_it(normalize(text)) # era normalize(text) self.assertEqual(res[0], expected_date, 'per=' + text) self.assertEqual(res[1], expected_leftover, 'per=' + text) testExtract_it('che ore sono adesso', '2018-01-13 13:04:00', 'che ora sono') testExtract_it('tra due secondi', '2018-01-13 13:04:02', '') testExtract_it('fra un minuto', '2018-01-13 13:05:00', '') testExtract_it('tra un paio di minuti', '2018-01-13 13:06:00', '') testExtract_it('tra un paio di ore', '2018-01-13 15:04:00', '') testExtract_it('tra due settimane', '2018-01-27 00:00:00', '') testExtract_it('fra un paio di mesi', '2018-03-13 00:00:00', '') testExtract_it('tra un paio di anni', '2020-01-13 00:00:00', '') testExtract_it('tra un decennio', '2028-01-13 00:00:00', '') testExtract_it('fra un paio di decenni', '2038-01-13 00:00:00', '') testExtract_it('nel prossimo decennio', '2028-01-13 00:00:00', '') testExtract_it('nel decennio prossimo', '2028-01-13 00:00:00', '') testExtract_it('nello scorso decennio', 
'2008-01-13 00:00:00', '') testExtract_it('nel decennio passato', '2008-01-13 00:00:00', '') testExtract_it('tra un secolo', '2118-01-13 00:00:00', '') testExtract_it('fra un millennio', '3018-01-13 00:00:00', '') testExtract_it('tra un paio di decenni', '2038-01-13 00:00:00', '') testExtract_it('tra 5 decenni', '2068-01-13 00:00:00', '') testExtract_it('fra un paio di secoli', '2218-01-13 00:00:00', '') testExtract_it('tra 2 secoli', '2218-01-13 00:00:00', '') testExtract_it('fra un paio di millenni', '4018-01-13 00:00:00', '') testExtract_it('appuntamento tra un ora', '2018-01-13 14:04:00', 'appuntamento') # testExtract_it('lo voglio entro l\'ora', # '2018-01-13 14:04:00', 'lo voglio entro') # TODO: MycroftAI/#125 # testExtract_it('in 1 secondo', # '2018-01-13 13:04:01', '') testExtract_it('tra 2 secondi', '2018-01-13 13:04:02', '') testExtract_it('Imposta l\'imboscata tra 1 minuto', '2018-01-13 13:05:00', 'imposta imboscata') testExtract_it('Imposta l\'imboscata tra mezzora', '2018-01-13 13:34:00', 'imposta imboscata') testExtract_it('imposta l\'imboscata tra 5 giorni da oggi', '2018-01-18 00:00:00', 'imposta imboscata') testExtract_it('quali sono previsioni meteo di dopo domani?', '2018-01-15 00:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono previsioni meteo dopo il prossimo giovedi?', '2018-01-18 00:00:00', 'quali sono previsioni meteo') testExtract_it('quali erano previsioni meteo dopo lo scorso giovedi?', '2018-01-11 00:00:00', 'quali erano previsioni ' 'meteo dopo') testExtract_it('quali sono previsioni meteo dopo giovedi prossimo?', '2018-01-25 00:00:00', 'quali sono previsioni meteo') testExtract_it('quali erano previsioni meteo dopo giovedi scorso?', '2018-01-11 00:00:00', 'quali erano previsioni meteo') testExtract_it('quali erano previsioni meteo da adesso?', '2018-01-13 00:00:00', 'quali erano previsioni meteo') testExtract_it('ricordami alle 10:45 pm', '2018-01-13 22:45:00', 'ricordami') testExtract_it('quale è il meteo di venerdì 
mattina', '2018-01-19 08:00:00', 'quale meteo') testExtract_it('quale è il meteo di domani', '2018-01-14 00:00:00', 'quale meteo') testExtract_it('quali sono le previsioni meteo di oggi pomeriggio', '2018-01-13 15:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono le previsioni meteo di oggi pomeriggio ' 'presto', '2018-01-13 14:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono le previsioni meteo di questa sera', '2018-01-13 19:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono le previsioni meteo di questa sera tardi', '2018-01-13 20:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono le previsioni meteo di mezzogiorno', '2018-01-14 12:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono le previsioni meteo di mezzanotte', '2018-01-14 00:00:00', 'quali sono previsioni meteo') # TODO MycroftAI/#125 # testExtract_it('quali sono le previsioni meteo di mezzo giorno', # '2018-01-14 12:00:00', 'quali sono previsioni meteo') # testExtract_it('quali sono le previsioni meteo di mezza notte', # '2018-01-14 00:00:00', 'quali sono previsioni meteo') testExtract_it('quali sono le previsioni meteo di questa mattina', '2018-01-14 08:00:00', 'quali sono previsioni meteo') testExtract_it('ricordami di chiamare mamma il 3 agosto', '2018-08-03 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami domani di chiamare mamma alle 7 del mattino', '2018-01-14 07:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 7 di sera', '2018-01-13 19:00:00', 'ricordami chiamare mamma') testExtract_it('chiamare mamma tra un ora', '2018-01-13 14:04:00', 'chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 0600', '2018-01-14 06:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 09 e 30', '2018-01-13 21:30:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 7 in punto', '2018-01-13 19:00:00', 'ricordami 
chiamare mamma') testExtract_it('ricordami di chiamare mamma questa sera alle 7 ' 'in punto', '2018-01-13 19:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 7 questa sera', '2018-01-13 19:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma domani alle 7 in punto' ' del mattino', '2018-01-14 07:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma giovedi sera ' 'alle 7 in punto', '2018-01-18 19:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma giovedi ' 'mattina alle 7 in punto', '2018-01-18 07:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 7 ' 'in punto di giovedi mattina', '2018-01-18 07:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 7:00 ' 'di giovedi mattina', '2018-01-18 07:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 7:00 di giovedi sera', '2018-01-18 19:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 11:00 di ' 'giovedi sera', '2018-01-18 23:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 2:00 di giovedi ' 'notte', '2018-01-18 02:00:00', 'ricordami chiamare mamma notte') testExtract_it('ricordami di chiamare mamma alle 2:00 di giovedi ' 'pomeriggio', '2018-01-18 14:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma mercoledì sera alle 8', '2018-01-17 20:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra due ore', '2018-01-13 15:04:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra quindici minuti', '2018-01-13 13:19:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra mezzora', '2018-01-13 13:34:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra un quarto di ora', '2018-01-13 13:19:00', 'ricordami chiamare 
mamma') testExtract_it('ricordami di chiamare mamma tra tre quarti di ora', '2018-01-13 13:49:00', 'ricordami chiamare mamma') testExtract_it('Play Rick Astley music 2 giorni da venerdì', '2018-01-21 00:00:00', 'play rick astley music') testExtract_it('Iniziare l\'invasione alle 3:45 pm di giovedì', '2018-01-18 15:45:00', 'iniziare invasione') testExtract_it('di lunedì, ordinare la torta pasticceria', '2018-01-15 00:00:00', 'ordinare torta pasticceria') testExtract_it('Play Happy Birthday music 5 anni da oggi', '2023-01-13 00:00:00', 'play happy birthday music') testExtract_it('comprare fuochi d\'artificio il 4 di luglio', '2018-07-04 00:00:00', 'comprare fuochi d\'artificio') testExtract_it('quale è il meteo 2 settimane dopo il prossimo venerdì', '2018-02-02 00:00:00', 'quale meteo') testExtract_it('quale è il meteo mercoledì alle ore 0700 ', '2018-01-17 07:00:00', 'quale meteo') testExtract_it('Fissa la visita tra 2 settimane e 6 giorni da sabato', '2018-02-02 00:00:00', 'fissa visita') testExtract_it('iniziare l\'invasione giovedì alle 03 45', '2018-01-18 03:45:00', 'iniziare invasione') testExtract_it('iniziare l\'invasione alle 800 di giovedì', '2018-01-18 08:00:00', 'iniziare invasione') testExtract_it('iniziare la festa alle 8 in punto della sera' ' di giovedi', '2018-01-18 20:00:00', 'iniziare festa') testExtract_it('iniziare l\'invasione alle 8 della sera di giovedì', '2018-01-18 20:00:00', 'iniziare invasione') testExtract_it('iniziare l\'invasione di giovedi a mezzogiorno', '2018-01-18 12:00:00', 'iniziare invasione') testExtract_it('iniziare l\'invasione di giovedi a mezzanotte', '2018-01-19 00:00:00', 'iniziare invasione') testExtract_it('iniziare l\'invasione di giovedi alle 0500', '2018-01-18 05:00:00', 'iniziare invasione') testExtract_it('remind me to wake up tra 4 anni', '2022-01-13 00:00:00', 'remind me to wake up') testExtract_it('remind me to wake up tra 4 anni e 4 giorni', '2022-01-17 00:00:00', 'remind me to wake up') testExtract_it('quali le 
previsioni meteo 3 giorni dopo domani?', '2018-01-17 00:00:00', 'quali previsioni meteo') testExtract_it('il dicembre 3', '2018-12-03 00:00:00', '') testExtract_it('nel 3 dicembre', '2018-12-03 00:00:00', '') testExtract_it('il dic 3 2019', '2019-12-03 00:00:00', '') testExtract_it('il 3 feb 2019', '2019-02-03 00:00:00', '') testExtract_it('incontriamoci alle 8:00 questa sera', '2018-01-13 20:00:00', 'incontriamoci') testExtract_it('incontriamoci alle 5 pm', '2018-01-13 17:00:00', 'incontriamoci') testExtract_it('incontriamoci alle 8 a.m.', '2018-01-14 08:00:00', 'incontriamoci') testExtract_it('ricordami di svegliarmi alle 8 a.m', '2018-01-14 08:00:00', 'ricordami svegliarmi') testExtract_it('come è il tempo di giovedi', '2018-01-18 00:00:00', 'come tempo') testExtract_it('come è il tempo di lunedi', '2018-01-15 00:00:00', 'come tempo') testExtract_it('quale è il tempo di questo mercoledì', '2018-01-17 00:00:00', 'quale tempo') testExtract_it('per giovedi quale è il meteo', '2018-01-18 00:00:00', 'quale meteo') testExtract_it('questo giovedi quale è il meteo', '2018-01-18 00:00:00', 'quale meteo') testExtract_it('lo scorso lunedi quale era il meteo', '2018-01-08 00:00:00', 'quale meteo') testExtract_it('imposta un avviso per mercoledi sera alle 8', '2018-01-17 20:00:00', 'imposta avviso') testExtract_it('imposta un avviso per mercoledi alle 3 in punto' ' del pomeriggio', '2018-01-17 15:00:00', 'imposta avviso') testExtract_it('imposta un avviso per mercoledi alle 3 in punto del' ' mattino', '2018-01-17 03:00:00', 'imposta avviso') # TODO MycroftAI/#125 # testExtract_it('imposta una sveglia per mercoledi mattina alle' # ' 7 in punto', # '2018-01-17 07:00:00', 'imposta una sveglia') # testExtract_it('imposta una sveglia per oggi alle 7 in punto', # '2018-01-13 19:00:00', 'imposta una sveglia') # testExtract_it('imposta una sveglia per questa sera alle 7 in punto', # '2018-01-13 19:00:00', 'imposta sveglia') # testExtract_it('imposta una sveglia per questa sera alle 
07:00', # '2018-01-13 19:00:00', 'imposta una sveglia') testExtract_it('nella sera del 5 giugno 2017 ricordami di' + ' chiamare mia madre', '2017-06-05 19:00:00', 'ricordami chiamare mia madre') # TODO MycroftAI/#125 # testExtract_it('aggiorna il mio calendario per un meeting al mattino' + # ' con Giulio il 4 marzo', # '2018-03-04 08:00:00', # 'aggiorna mio calendario meeting con giulio') testExtract_it('quale giorno è oggi', '2018-01-13 00:00:00', 'quale giorno') testExtract_it('che giorno è domani', '2018-01-14 00:00:00', 'che giorno') testExtract_it('che giorno era ieri', '2018-01-12 00:00:00', 'che giorno') testExtract_it('che giorno è dopo domani', '2018-01-15 00:00:00', 'che giorno') testExtract_it('fissare la cena tra 5 giorni', '2018-01-18 00:00:00', 'fissare cena') testExtract_it('Come è il tempo per dopodomani', '2018-01-15 00:00:00', 'come tempo') testExtract_it('ricordami alle 22:45', '2018-01-13 22:45:00', 'ricordami') testExtract_it('Come è il tempo venerdì mattina', '2018-01-19 08:00:00', 'come tempo') testExtract_it('ricordami di chiamare mamma giovedi prossimo', '2018-01-25 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 3 settimane', '2018-02-03 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 8 settimane', '2018-03-10 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 8 settimane' ' e 2 giorni', '2018-03-12 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 4 giorni', '2018-01-17 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 3 mesi', '2018-04-13 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 2 anni e 2 giorni', '2020-01-15 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma la prossima settimana', '2018-01-20 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma 
la settimana prossima', '2018-01-20 00:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di controllare spese della settimana scorsa', '2018-01-06 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese della scorsa settimana', '2018-01-06 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese del mese scorso', '2017-12-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese dello scorso mese', '2017-12-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese del mese prossimo', '2018-02-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese dello prossimo mese', '2018-02-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese dell anno scorso', '2017-01-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese dello scorso anno', '2017-01-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese del anno prossimo', '2019-01-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di controllare spese dello prossimo anno', '2019-01-13 00:00:00', 'ricordami controllare spese') testExtract_it('ricordami di telefonare giovedì prossimo', '2018-01-25 00:00:00', 'ricordami telefonare') testExtract_it('ricordami di telefonare il prossimo giovedì', '2018-01-25 00:00:00', 'ricordami telefonare') testExtract_it('ricordami di controllare spese di giovedi scorso', '2018-01-11 00:00:00', 'ricordami controllare spese') testExtract_it('Gioca a briscola 2 giorni dopo venerdì', '2018-01-21 00:00:00', 'gioca briscola') testExtract_it('Inizia le pulizie alle 15:45 di giovedì', '2018-01-18 15:45:00', 'inizia pulizie') testExtract_it('lunedi compra formaggio', '2018-01-15 00:00:00', 'compra formaggio') testExtract_it('suona musica compleanno tra 5 anni da oggi', '2023-01-13 00:00:00', 
'suona musica compleanno') testExtract_it('Invia Skype alla mamma alle 12:45 di giovedì' ' prossimo.', '2018-01-25 12:45:00', 'invia skype mamma') testExtract_it('Come è il tempo questo venerdì?', '2018-01-19 00:00:00', 'come tempo') testExtract_it('Come è il tempo questo venerdì pomeriggio?', '2018-01-19 15:00:00', 'come tempo') # TODO MycroftAI/#125 # testExtract_it('Come è il tempo questo venerdì a mezza notte?', # '2018-01-20 00:00:00', 'come tempo') testExtract_it('Come è il tempo questo venerdì a mezzogiorno?', '2018-01-19 12:00:00', 'come tempo') testExtract_it('Ricordami di chiamare mia madre il 3 agosto.', '2018-08-03 00:00:00', 'ricordami chiamare mia madre') testExtract_it('compra le candele il 1° maggio', '2018-05-01 00:00:00', 'compra candele') testExtract_it('Come è il tempo 1 giorno dopo domani?', '2018-01-15 00:00:00', 'come tempo') testExtract_it('Come è il tempo alle ore 7?', '2018-01-13 19:00:00', 'come tempo') testExtract_it('Come è il tempo domani alle 7 in punto?', '2018-01-14 07:00:00', 'come tempo') testExtract_it('Come è il tempo domani alle 2 del pomeriggio', '2018-01-14 14:00:00', 'come tempo') testExtract_it('Come è il tempo domani pomeriggio alle 2', '2018-01-14 14:00:00', 'come tempo') testExtract_it('Come è il tempo domani per le 2:00', '2018-01-14 02:00:00', 'come tempo') testExtract_it('Come è il tempo alle 2 del pomeriggio di ' 'venerdì prossimo?', '2018-01-26 14:00:00', 'come tempo') testExtract_it('Ricordami di svegliarmi tra 4 anni', '2022-01-13 00:00:00', 'ricordami svegliarmi') testExtract_it('Ricordami di svegliarmi tra 4 anni e 4 giorni', '2022-01-17 00:00:00', 'ricordami svegliarmi') testExtract_it('Dormi 3 giorni da domani.', '2018-01-17 00:00:00', 'dormi') testExtract_it('segna appuntamento tra 2 settimane e 6 giorni ' 'dopo sabato', '2018-02-02 00:00:00', 'segna appuntamento') testExtract_it('La festa inizia alle 8 di sera di giovedì', '2018-01-18 20:00:00', 'festa inizia') testExtract_it('Come è il meteo 3 tra giorni?', 
'2018-01-16 00:00:00', 'come meteo') testExtract_it('fissa appuntamento dicembre 3', '2018-12-03 00:00:00', 'fissa appuntamento') testExtract_it('incontriamoci questa sera alle 8 ', '2018-01-13 20:00:00', 'incontriamoci') testExtract_it('incontriamoci alle 8 questa sera', '2018-01-13 20:00:00', 'incontriamoci') testExtract_it('impostare sveglia questa sera alle 9 ', '2018-01-13 21:00:00', 'impostare sveglia') testExtract_it('impostare sveglia questa sera alle 21 ', '2018-01-13 21:00:00', 'impostare sveglia') testExtract_it('inserire appuntamento domani sera alle 23', '2018-01-14 23:00:00', 'inserire appuntamento') # TODO MycroftAI/#125 # testExtract_it('inserire appuntamento domani alle 9 e mezza', # '2018-01-14 09:30:00', 'inserire appuntamento') testExtract_it('inserire appuntamento domani sera alle 23 e 3 quarti', '2018-01-14 23:45:00', 'inserire appuntamento') testExtract_it('inserire appuntamento domani sera alle 23 e 5 quarti', '2018-01-14 23:00:00', 'inserire appuntamento') def test_extractdatetime_it_normalized(self): """ Test cases for Italian datetime parsing """ def extractWithFormat_it(text): date = datetime(2018, 1, 13, 13, 4, tzinfo=default_timezone()) # Sab 13 Gen, 2018 @ 13:04 [extractedDate, leftover] = extract_datetime(text, date, lang='it-it') extractedDate = extractedDate.strftime('%Y-%m-%d %H:%M:%S') return [extractedDate, leftover] def testExtract_it(text, expected_date, expected_leftover): res = extractWithFormat_it(normalize(text, lang='it-it')) self.assertEqual(res[0], expected_date, 'per=' + text) self.assertEqual(res[1], expected_leftover, 'per=' + text) testExtract_it('ricordami di chiamare mamma tra 15 minuti', '2018-01-13 13:19:00', 'ricordami chiamare mamma') testExtract_it('chiamare mamma alle 17 e 30', '2018-01-13 17:30:00', 'chiamare mamma') testExtract_it('ricordami di chiamare mamma tra 15 minuti', '2018-01-13 13:19:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma sabato alle 10 ' + 'del mattino', 
'2018-01-13 10:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 10 del mattino di' ' questo sabato', '2018-01-13 10:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 10 del mattino di' ' sabato prossimo', '2018-01-20 10:00:00', 'ricordami chiamare mamma') testExtract_it('ricordami di chiamare mamma alle 10 del mattino del' ' prossimo sabato', '2018-01-20 10:00:00', 'ricordami chiamare mamma') testExtract_it('Come è il tempo questo venerdì alle 11 del mattino?', '2018-01-19 11:00:00', 'come tempo') testExtract_it('comprare fragole il 13 maggio', '2018-05-13 00:00:00', 'comprare fragole') testExtract_it('inserire appuntamento domani sera alle 23 e' + ' tre quarti', '2018-01-14 23:45:00', 'inserire appuntamento') def test_extract_ambiguous_time_it(self): mattina = datetime(2017, 6, 27, 8, 1, 2, tzinfo=default_timezone()) sera = datetime(2017, 6, 27, 20, 1, 2, tzinfo=default_timezone()) mezzogiorno = datetime(2017, 6, 27, 12, 1, 2, tzinfo=default_timezone()) self.assertEqual( extract_datetime('dai da mangiare ai pesci alle 10 in punto', anchorDate=mattina, lang='it-it')[0], datetime(2017, 6, 27, 10, 0, 0, tzinfo=default_timezone())) self.assertEqual( extract_datetime('dai da mangiare ai pesci alle 10 in punto', mezzogiorno, lang='it-it')[0], datetime(2017, 6, 27, 22, 0, 0, tzinfo=default_timezone())) self.assertEqual( extract_datetime('dai da mangiare ai pesci alle 10 in punto', sera, lang='it-it')[0], datetime(2017, 6, 27, 22, 0, 0, tzinfo=default_timezone())) def test_extract_relativedatetime_it(self): """ Test cases for relative datetime """ def extractWithFormat(text): date = datetime(2017, 6, 27, 10, 1, 2, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date, lang='it-it') extractedDate = extractedDate.strftime('%Y-%m-%d %H:%M:%S') return [extractedDate, leftover] def testExtract_it(text, expected_date, expected_leftover): res = extractWithFormat(normalize(text)) 
self.assertEqual(res[0], expected_date, 'per =' + text) self.assertEqual(res[1], expected_leftover, 'per =' + text) testExtract_it('incontriamoci tra 5 minuti', '2017-06-27 10:06:02', 'incontriamoci') testExtract_it('incontriamoci tra 5 secondi', '2017-06-27 10:01:07', 'incontriamoci') testExtract_it('incontriamoci tra 1 ora', '2017-06-27 11:01:02', 'incontriamoci') testExtract_it('incontriamoci tra 2 ore', '2017-06-27 12:01:02', 'incontriamoci') testExtract_it('incontriamoci tra 1 minuto', '2017-06-27 10:02:02', 'incontriamoci') # TODO MycroftAI/#125 # testExtract_it('incontriamoci tra 1 secondo', # '2017-06-27 10:01:03', 'incontriamoci') def test_spaces_it(self): """ Test cases for Italian remove spaces """ self.assertEqual(normalize('questo è un test ', lang='it'), 'questo è 1 test') self.assertEqual(normalize('un altro test ', lang='it'), '1 altro test') self.assertEqual(normalize('questa è un\' altra amica ', lang='it', remove_articles=False), 'questa è 1 altra amica') self.assertEqual(normalize('questo è un test ', lang='it', remove_articles=False), 'questo è 1 test') def test_numbers_it(self): """ Test cases for Italian normalize lang='it' """ self.assertEqual(normalize('è un test sette otto nove', lang='it'), 'è 1 test 7 8 9') self.assertEqual(normalize('test zero dieci undici dodici tredici', lang='it'), 'test 0 10 11 12 13') self.assertEqual(normalize('test mille seicento sessanta e sei', lang='it', remove_articles=False), 'test 1000 600 60 e 6') self.assertEqual(normalize('test sette e mezzo', lang='it', remove_articles=False), 'test 7 e 0.5') self.assertEqual(normalize('test due punto nove', lang='it'), 'test 2 punto 9') self.assertEqual(normalize('test cento e nove', lang='it', remove_articles=False), 'test 100 e 9') self.assertEqual(normalize('test venti e 1', lang='it'), 'test 20 e 1') self.assertEqual(normalize('test ventuno e ventisette', lang='it'), 'test 21 e 27') def test_multiple_numbers_it(self): self.assertEqual(extract_numbers('questo è il 
test uno due tre', lang='it'), [1.0, 2.0, 3.0]) self.assertEqual(extract_numbers('questo è il test quattro sette' + ' quattro', lang='it'), [4.0, 7.0, 4.0]) self.assertEqual(extract_numbers('questo è il test cinque sei sette', lang='it'), [5.0, 6.0, 7.0]) self.assertEqual(extract_numbers('questo è test dieci undici dodici', lang='it'), [10.0, 11.0, 12.0]) self.assertEqual(extract_numbers('test dodici gatti ventuno', lang='it'), [21.0, 12.0]) self.assertEqual(extract_numbers('1 cane, sette maiali, macdonald ' + 'aveva la fattoria, 3 volte' + ' 5 macarena', lang='it'), [1, 7, 3, 5]) self.assertEqual(extract_numbers('due birre per due orsi', lang='it'), [2.0, 2.0]) self.assertEqual(extract_numbers('venti quaranta trenta', lang='it'), [20, 40, 30]) self.assertEqual(extract_numbers('venti 20 22', lang='it'), [20, 20, 22]) self.assertEqual(extract_numbers('ventidue matti venti ratti ' 'venti gatti', lang='it'), [22, 20, 20]) self.assertEqual(extract_numbers('venti 20 venti 2', lang='it'), [20, 20, 20, 2]) self.assertEqual(extract_numbers('un terzo uno', lang='it'), [1 / 3, 1]) # self.assertEqual(extract_numbers('un terzo uno', # lang='it', ordinals=True), [3]) self.assertEqual(extract_numbers('sei miliardi', lang='it', short_scale=True), [6e9]) self.assertEqual(extract_numbers('seimilioni', lang='it', short_scale=False), [6e6]) self.assertEqual(extract_numbers('dodici maiali accompagnano \ seimiliardi di batteri', lang='it', short_scale=True), [6e9, 12]) # TODO case when pronounced/extracted number don't match # fractional numbers often fail # self.assertEqual(extract_numbers('questo è un sette otto\ # nove e mezzo test',lang='it'), [7.0, 8.0, 9.5]) # TODO pronounce number should accept short_scale flag # self.assertEqual(extract_numbers('two pigs and six trillion # bacteria', short_scale=False), [2, 6e18]) # TODO pronounce_number should accept ordinals flag # self.assertEqual(extract_numbers('thirty second or first', # ordinals=True), [32, 1]) def 
test_extractdatetime_default_it(self): default = time(9, 0, 0) anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime('Come è il meteo 3 tra giorni?', anchor, lang='it-it', default_time=default) self.assertEqual(default, res[0].time()) def test_gender_it(self): """ Test cases for Italian grammar , lang='it' """ self.assertEqual(get_gender('mucca', lang='it'), 'f') self.assertEqual(get_gender('cavallo', lang='it'), 'm') self.assertEqual(get_gender('mucche', 'le mucche', lang='it'), 'f') self.assertEqual(get_gender('bue', 'il bue mangia la erba', lang='it'), 'm') self.assertEqual(get_gender('pesce', 'il pesce nuota', lang='it'), 'm') self.assertEqual(get_gender('tigre', lang='it'), 'f') self.assertEqual(get_gender('uomini', 'questi uomini mangiano pasta', lang='it'), 'm') self.assertEqual(get_gender('ponte', 'il ponte', lang='it'), 'm') self.assertEqual(get_gender('ponte', 'questo ponte è caduto', lang='it'), 'm') self.assertEqual(get_gender('scultrice', 'questa scultrice famosa', lang='it'), 'f') self.assertEqual(get_gender('scultore', 'questo scultore famoso', lang='it'), 'm') self.assertEqual(get_gender('scultori', 'gli scultori rinascimentali', lang='it'), 'm') self.assertEqual(get_gender('scultrici', 'le scultrici moderne', lang='it'), 'f') if __name__ == '__main__': unittest.main() lingua-franca-release-v0.4.3/test/test_parse_nl.py000066400000000000000000000237561426211343400222410ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright 2019 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # import unittest from datetime import datetime, time, timedelta from lingua_franca import load_language, set_default_lang, unload_language from lingua_franca.parse import extract_datetime, extract_number, normalize, extract_duration from lingua_franca.time import default_timezone LANG = "nl-nl" def setUpModule(): load_language('nl-nl') set_default_lang('nl-nl') def tearDownModule(): unload_language('nl') class TestParsing(unittest.TestCase): def test_articles(self): self.assertEqual( normalize("dit is de test", LANG, remove_articles=True), "dit is test") self.assertEqual( normalize("en nog een Test", LANG, remove_articles=True), "en nog 1 Test") self.assertEqual(normalize("dit is de Extra-Test", LANG, remove_articles=False), "dit is de Extra-Test") def test_extract_number(self): self.assertEqual(extract_number("dit is de eerste Test", lang=LANG, ordinals=True), 1) self.assertEqual(extract_number("dit is 2 Test", lang=LANG), 2) self.assertEqual(extract_number("dit is tweede Test", lang=LANG, ordinals=True), 2) self.assertEqual( extract_number("dit is Test drie", lang=LANG), 3) self.assertEqual( extract_number("dit is Test Drie", lang=LANG), 3) self.assertEqual( extract_number("dit is de Test Nummer 4", lang=LANG), 4) self.assertEqual(extract_number("één derde kopje", lang=LANG), 1.0 / 3.0) self.assertEqual(extract_number("drie kopjes", lang=LANG), 3) self.assertEqual(extract_number("1/3 kopje", lang=LANG), 1.0 / 3.0) self.assertEqual(extract_number("een kwart kopje", lang=LANG), 0.25) self.assertEqual(extract_number("1/4 kopje", lang=LANG), 0.25) self.assertEqual(extract_number("kwart kopje", lang=LANG), 0.25) self.assertEqual(extract_number("2/3 kopje", lang=LANG), 2.0 / 3.0) self.assertEqual(extract_number("3/4 kopje", lang=LANG), 3.0 / 4.0) self.assertEqual(extract_number("1 en 3/4 kopje", lang=LANG), 1.75) self.assertEqual(extract_number("1 kopje en een half", 
lang=LANG), 1.5) self.assertEqual(extract_number("anderhalf kopje", lang=LANG), 1.5) self.assertEqual(extract_number("driekwart kopje", lang=LANG), 3.0 / 4.0) self.assertEqual(extract_number("driekwart kopje", lang=LANG), 3.0 / 4.0) def test_extractdatetime_nl(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 0, 0, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, anchorDate=date, lang=LANG) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(text) self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) testExtract("zet een alarm voor 1 dag na vandaag", "2017-06-28 00:00:00", "zet een alarm") testExtract("laten we om 8:00 's avonds afspreken", "2017-06-27 20:00:00", "laten we afspreken") testExtract("zet een alarm voor 5 dagen na vandaag", "2017-07-02 00:00:00", "zet een alarm") testExtract("wat voor weer is het overmorgen?", "2017-06-29 00:00:00", "wat voor weer is") testExtract("herinner me om 10:45 's avonds", "2017-06-27 22:45:00", "herinner me") testExtract("Hoe is het weer morgen", "2017-06-28 00:00:00", "hoe is weer") testExtract("3 december", "2017-12-03 00:00:00", "") testExtract("hoe is het weer vandaag", "2017-06-27 00:00:00", "hoe is weer") testExtract("herinner me over 5 jaar aan mijn contract", "2022-06-27 00:00:00", "herinner me aan mijn contract") testExtract("hoe is het weer volgende week vrijdag", "2017-06-30 00:00:00", "hoe is weer") testExtract("herinner me mijn moeder te bellen op 7 september", "2017-09-07 00:00:00", "herinner me mijn moeder te bellen") testExtract("hoe is het weer 3 dagen na vandaag", "2017-06-30 00:00:00", "hoe is weer") testExtract( "herinner me vanavond aan het ophalen van mijn kinderen", "2017-06-27 19:00:00", "herinner me aan ophalen van mijn kinderen") testExtract( "Herinner me mijn moeder te bellen over 8 weken en 2 dagen", 
"2017-08-24 00:00:00", "herinner me mijn moeder te bellen") testExtract("Speel rick astley 2 dagen na vrijdag", "2017-07-02 00:00:00", "speel rick astley") testExtract("plan een afspraak in de nacht van 3 september", "2017-09-03 00:00:00", "plan een afspraak") testExtract("hoe is het weer morgenavond", "2017-06-28 19:00:00", "hoe is weer") testExtract("hoe is het weer woensdagavond", "2017-06-28 19:00:00", "hoe is weer") testExtract("hoe is het weer dinsdagochtend", "2017-06-27 08:00:00", "hoe is weer") testExtract("plan een afspraak in voor donderdagmiddag", "2017-06-29 15:00:00", "plan een afspraak") testExtract("Wat voor weer wordt het vrijdagochtend", "2017-06-30 08:00:00", "wat voor weer wordt") # TODO these fail altogether # testExtract("laten we vanavond om 8:00 uur afspreken", # "2017-06-27 20:00:00", "laten we afspreken") # testExtract( # "wordt er regen verwacht op maandag om 3 uur 's middags", "", "") # testExtract("plan een afspraak in voor maandagmiddag 4 uur", # "2017-07-03 16:00:00", "plan een afspraak") # testExtract("plan een afspraak om 2 uur 's middags", # "2017-06-27 14:00:00", "plan een afspraak") def test_extractdatetime_default_nl(self): default = time(9, 0, 0) anchor = datetime(2019, 11, 1, 0, 0) res = extract_datetime("laten we afspreken op donderdag", anchor, lang=LANG, default_time=default) self.assertEqual(default, res[0].time()) def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" self.assertEqual(extract_datetime('geen tijd', lang=LANG), None) def test_spaces(self): self.assertEqual(normalize(" dit is een test", LANG), "dit is 1 test") self.assertEqual(normalize(" dit is een test ", LANG), "dit is 1 test") def test_numbers(self): self.assertEqual( normalize("dit is een twee drie test", LANG), "dit is 1 2 3 test") self.assertEqual( normalize("dit is vier vijf zes test", LANG), "dit is 4 5 6 test") self.assertEqual( normalize("dit is zeven acht negen test", LANG), "dit is 7 8 9 test") 
self.assertEqual( normalize("dit is zeven acht negen test", LANG), "dit is 7 8 9 test") self.assertEqual( normalize("dit is tien elf twaalf test", LANG), "dit is 10 11 12 test") self.assertEqual( normalize("dit is dertien veertien test", LANG), "dit is 13 14 test") self.assertEqual( normalize(u"dit is vijftien zestien zeventien", LANG), "dit is 15 16 17") self.assertEqual( normalize("dit is achttien negentien twintig", LANG), "dit is 18 19 20") def test_extract_duration_nl(self): self.assertEqual(extract_duration("een minuut", LANG), (timedelta(seconds=60), "")) self.assertEqual(extract_duration("10 minuten", LANG), (timedelta(seconds=600), "")) self.assertEqual(extract_duration("een uur en 2 minuten", LANG), (timedelta(seconds=3720), "en")) self.assertEqual(extract_duration("een dag", LANG), (timedelta(days=1), "")) self.assertEqual(extract_duration("twee dag", LANG), (timedelta(days=2), "")) self.assertEqual(extract_duration("vijf minuten na het uur", LANG), (timedelta(seconds=300), "na het uur")) self.assertEqual(extract_duration("zet een timer voor 1 uur", LANG), (timedelta(seconds=3600), "zet 1 timer voor")) self.assertEqual(extract_duration("een treinrit van 2 uur, 17 minuten en zestien seconden", LANG), (timedelta(seconds=8236), "1 treinrit van , en")) self.assertEqual(extract_duration("een uurtje", LANG), (timedelta(seconds=3600), "")) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_pl.py000066400000000000000000000740261426211343400222370ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import unittest from datetime import datetime, timedelta from lingua_franca import get_default_lang, set_default_lang, \ load_language, unload_language from lingua_franca.time import default_timezone from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_duration from lingua_franca.parse import extract_number, extract_numbers from lingua_franca.parse import normalize def setUpModule(): load_language("pl-pl") set_default_lang("pl") def tearDownModule(): unload_language("cs") class TestNormalize(unittest.TestCase): def test_extract_number(self): self.assertEqual(extract_number('to jest pół testu'), 0.5) self.assertEqual(extract_number("to jest pierwszy test", ordinals=True), 1) self.assertEqual(extract_number("to jest 2 test"), 2) self.assertEqual(extract_number("to jest drugi test", ordinals=True), 2) self.assertEqual(extract_number("to jest trzeci test", ordinals=True), 3.0) self.assertEqual(extract_number("czwarty test", ordinals=True), 4.0) self.assertEqual(extract_number("trzydziesty szósty test", ordinals=True), 36.0) self.assertEqual(extract_number("to jest test numer 4"), 4) self.assertEqual(extract_number("jedna trzecia szklanki"), 1.0 / 3.0) self.assertEqual(extract_number("trzy szklanki"), 3) self.assertEqual(extract_number("1/3 szklanki"), 1.0 / 3.0) self.assertEqual(extract_number("jedna czwarta szklanki"), 0.25) self.assertEqual(extract_number("1/4 szklanki"), 0.25) self.assertEqual(extract_number("jedna czwarta szklanki"), 0.25) self.assertEqual(extract_number("2/3 szklanki"), 2.0 / 3.0) 
self.assertEqual(extract_number("3/4 szklanki"), 3.0 / 4.0) self.assertEqual(extract_number("1 i 3/4 szklanki"), 1.75) self.assertEqual(extract_number("1 szklanka i jedna druga"), 1.5) self.assertEqual(extract_number("jedna szklanka i jedna druga"), 1.5) self.assertEqual(extract_number("jeden i jedna druga szklanki"), 1.5) self.assertEqual(extract_number("trzy czwarte szklanki"), 3.0 / 4.0) self.assertEqual(extract_number("dwadzieścia dwa"), 22) self.assertEqual(extract_number("Dwadzieścia dwa i trzy piąte"), 22.6) self.assertEqual(extract_number("dwieście"), 200) self.assertEqual(extract_number("dziewięć tysięcy"), 9000) self.assertEqual(extract_number("sześćset sześćdziesiąt sześć"), 666) self.assertEqual(extract_number("dwa miliony"), 2000000) self.assertEqual(extract_number("dwa miliony pięćset tysięcy " "ton metalu"), 2500000) self.assertEqual(extract_number("sześć bilionów"), 6000000000000.0) self.assertEqual(extract_number("jeden przecinek pięć"), 1.5) self.assertEqual(extract_number("trzy kropka czternaście"), 3.14) self.assertEqual(extract_number("zero przecinek dwa"), 0.2) self.assertEqual(extract_number("miliardy lat starsze"), 1000000000.0) self.assertEqual(extract_number("sto tysięcy"), 100000) self.assertEqual(extract_number("minus 2"), -2) self.assertEqual(extract_number("ujemne siedemdziesiąt"), -70) self.assertEqual(extract_number("tysiąc milionów"), 1000000000) self.assertEqual(extract_number("sześć trzecich"), 6 / 3) self.assertEqual(extract_number("trzydzieści sekund"), 30) self.assertEqual(extract_number("to jest miliardowy test", ordinals=True), 1e09) self.assertEqual(extract_number("to jest miliardowa część"), 1e-9) # Verify non-power multiples of ten no longer discard # adjacent multipliers self.assertEqual(extract_number("dwadzieścia tysięcy"), 20000) self.assertEqual(extract_number("pięćdziesiąt milionów"), 50000000) # Verify smaller powers of ten no longer cause miscalculation of larger # powers of ten (see MycroftAI#86) 
self.assertEqual(extract_number("trzysta dwadzieścia miliardów trzysta milionów \ dziewięćset pięćdziesiąt tysięcy sześćset \ siedemdziesiąt pięć kropka osiem"), 320300950675.8) self.assertEqual(extract_number("dziewięćset dziewięćdziesiąt dziewięć milionów \ dziewięćset dziewięćdziesiąt dziewięć tysięcy \ dziewięćset dziewięćdziesiąt dziewięć przecinek dziewięć"), 999999999.9) # TODO why does "trillion" result in xxxx.0? self.assertEqual(extract_number("osiemset bilionów dwieście \ pięćdziesiąt siedem"), 800000000000257.0) self.assertTrue(extract_number("Szybki gracz") is False) self.assertTrue(extract_number("krejzi") is False) self.assertTrue(extract_number("krejzi zero") is not False) self.assertEqual(extract_number("krejzi zero"), 0) self.assertTrue(extract_number("super 0") is not False) self.assertEqual(extract_number("super 0"), 0) self.assertEqual(extract_number( "jesteś drugi", ordinals=True), 2) self.assertEqual(extract_number("całkowicie 100%"), 100) def test_extract_duration_pl(self): self.assertEqual(extract_duration("10 sekund"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5 minut"), (timedelta(minutes=5), "")) self.assertEqual(extract_duration("2 godziny"), (timedelta(hours=2), "")) self.assertEqual(extract_duration("3 dni"), (timedelta(days=3), "")) self.assertEqual(extract_duration("25 tygodni"), (timedelta(weeks=25), "")) self.assertEqual(extract_duration("siedem godzin"), (timedelta(hours=7), "")) self.assertEqual(extract_duration("7.5 sekundy"), (timedelta(seconds=7.5), "")) self.assertEqual(extract_duration("osiem i pół dnia trzydzieści dziewięć sekund", lang='pl-pl'), (timedelta(days=8.5, seconds=39), "")) self.assertEqual(extract_duration("Ustaw stoper na 30 minut"), (timedelta(minutes=30), "ustaw stoper na")) self.assertEqual(extract_duration("Cztery i pół minuty do zachodu"), (timedelta(minutes=4.5), "do zachodu")) self.assertEqual(extract_duration("dziewiętnaście minut po pełnej godzinie"), (timedelta(minutes=19), 
"po pełnej godzinie")) self.assertEqual(extract_duration("obudź mnie za 3 tygodnie, czterysta dziewięćdziesiąt siedem dni i" " trzysta 91.6 sekund"), (timedelta(weeks=3, days=497, seconds=391.6), "obudź mnie za , i")) self.assertEqual(extract_duration("ten film trwa jedną godzinę, pięćdziesiąt siedem i pół minuty", lang='pl-pl'), (timedelta(hours=1, minutes=57.5), "ten film trwa ,")) self.assertEqual(extract_duration("10-sekund"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5-minut"), (timedelta(minutes=5), "")) def test_extractdatetime_pl(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) # Tue June 27, 2017 @ 1:04pm print(text) # TODO Remove me [extractedDate, leftover] = extract_datetime(text, date) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(normalize(text)) self.assertEqual(res[0], expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) testExtract("teraz jest czas", "2017-06-27 13:04:00", "jest czas") testExtract("za sekundę", "2017-06-27 13:04:01", "") testExtract("za minutę", "2017-06-27 13:05:00", "") testExtract("następna dekada", "2027-06-27 00:00:00", "") testExtract("za jeden wiek", "2117-06-27 00:00:00", "") testExtract("za jedno milenium", "3017-06-27 00:00:00", "") testExtract("za 5 dekad", "2067-06-27 00:00:00", "") testExtract("za 2 wieki", "2217-06-27 00:00:00", "") testExtract("za godzinę", "2017-06-27 14:04:00", "") testExtract("chcę to do godziny", "2017-06-27 14:04:00", "chcę to") testExtract("za 1 sekundę", "2017-06-27 13:04:01", "") testExtract("za 2 sekundy", "2017-06-27 13:04:02", "") testExtract("Nastaw zasadzkę na za minutę", "2017-06-27 13:05:00", "nastaw zasadzkę") testExtract("Nastaw zasadzkę na pół godziny", "2017-06-27 13:34:00", "nastaw zasadzkę") testExtract("Nastaw zasadzkę za 5 dni od dzisiaj", 
"2017-07-02 00:00:00", "nastaw zasadzkę") testExtract("pojutrze", "2017-06-29 00:00:00", "") testExtract("Jaka będzie pogoda pojutrze?", "2017-06-29 00:00:00", "jaka będzie pogoda") testExtract("Przypomnij mi o 10:45 po południu", "2017-06-27 22:45:00", "przypomnij mi") testExtract("Jaka będzie pogoda w piątek rano", "2017-06-30 08:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda jutro", "2017-06-28 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda dzisiaj po południu", "2017-06-27 15:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda dzisiaj wieczorem?", "2017-06-27 19:00:00", "jaka będzie pogoda") testExtract("jaka była pogoda dzisiaj rano", "2017-06-27 08:00:00", "jaka była pogoda") testExtract("przypomnij mi bym zadzwonił do mamy za 8 tygodni i 2 dni", "2017-08-24 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy 3 Sierpnia", "2017-08-03 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy jutro o 7 rano", "2017-06-28 07:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi jutro bym zadzwonił do mamy o 9 w nocy", "2017-06-28 21:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi jutro bym zadzwonił do mamy o 7 rano", "2017-06-28 07:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za godzinę", "2017-06-27 14:04:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 1730", "2017-06-27 17:30:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 0630", "2017-06-28 06:30:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 7", "2017-06-27 19:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy w czwartek o 7 wieczorem", "2017-06-29 19:00:00", "przypomnij mi bym zadzwonił do mamy") 
testExtract("Przypomnij mi bym zadzwonił do mamy w Czwartek o 7 rano", "2017-06-29 07:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 7 rano w Czwartek", "2017-06-29 07:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 2 godziny", "2017-06-27 15:04:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 15 minut", "2017-06-27 13:19:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za piętnaście minut", "2017-06-27 13:19:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za pół godziny", "2017-06-27 13:34:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 10 rano 2 dni po Sobocie", "2017-07-03 10:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Zagraj Rick Astley za 2 dni po Piątku", "2017-07-02 00:00:00", "zagraj rick astley") testExtract("Zacznij inwazję o 3:45 po południu", "2017-06-27 15:45:00", "zacznij inwazję") testExtract("W poniedziałek, zamów ciasto z piekarni", "2017-07-03 00:00:00", "zamów ciasto z piekarni") testExtract("Zagraj Wszystkiego Najlepszego za 5 lat od dzisiaj", "2022-06-27 00:00:00", "zagraj wszystkiego najlepszego") testExtract("Skype z Mamą o 12:45 w następny Czwartek", "2017-07-06 12:45:00", "skype z mamą") testExtract("Jaka będzie pogoda w następny Piątek", "2017-06-30 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w następną Środę", "2017-07-05 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w następny Czwartek", "2017-07-06 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w następny piątek rano", "2017-06-30 08:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w następny Piątek wieczorem", "2017-06-30 19:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w następny Piątek po południu", 
"2017-06-30 15:00:00", "jaka będzie pogoda") testExtract("Przypomnij mi bym zadzwonił do mamy 3 Sierpnia", "2017-08-03 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Kup fajerwerki 4 Lipca", "2017-07-04 00:00:00", "kup fajerwerki") testExtract("Jaka będzie pogoda za 2 tygodnie po następnym Piątku", "2017-07-14 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w Środę o 7 rano", "2017-06-28 07:00:00", "jaka będzie pogoda") testExtract("Ustaw spotkanie na 12:45 w następny Czwartek", "2017-07-06 12:45:00", "ustaw spotkanie") testExtract("Jaka będzie pogoda w ten Czwartek", "2017-06-29 00:00:00", "jaka będzie pogoda") testExtract("Ustaw wizytę na za 2 tygodnie i 6 dni od Soboty", "2017-07-21 00:00:00", "ustaw wizytę na") testExtract("Zacznij inwazję o 03 45 w Czwartek", "2017-06-29 03:45:00", "zacznij inwazję") testExtract("Zacznij inwazję o 8 wieczorem w Czwartek", "2017-06-29 20:00:00", "zacznij inwazję") testExtract("Zacznij inwazję w Czwartek południe", "2017-06-29 12:00:00", "zacznij inwazję") testExtract("Zacznij inwazję w Czwartek o północy", "2017-06-29 00:00:00", "zacznij inwazję") testExtract("Przypomnij mi bym się obudził za 4 lata", "2021-06-27 00:00:00", "przypomnij mi bym się obudził") testExtract("Przypomnij mi bym się obudził za 4 lata i 4 dni", "2021-07-01 00:00:00", "przypomnij mi bym się obudził") testExtract("Jaka będzie pogoda za 3 dni od jutra", "2017-07-01 00:00:00", "jaka będzie pogoda") testExtract("grudzień trzeci", "2017-12-03 00:00:00", "") testExtract("Spotkajmy się o 8 wieczorem", "2017-06-27 20:00:00", "spotkajmy się") testExtract("Spotkajmy się o 5 po południu", "2017-06-27 17:00:00", "spotkajmy się") testExtract("Spotkajmy się o 8 rano", "2017-06-28 08:00:00", "spotkajmy się") testExtract("Przypomnij mi bym się obudził o 8 rano", "2017-06-28 08:00:00", "przypomnij mi bym się obudził") testExtract("Jaka będzie pogoda we Wtorek", "2017-06-27 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w 
Poniedziałek", "2017-07-03 00:00:00", "jaka będzie pogoda") testExtract("Jaka będzie pogoda w środę", "2017-06-28 00:00:00", "jaka będzie pogoda") testExtract("w Czwartek jaka będzie pogoda", "2017-06-29 00:00:00", "jaka będzie pogoda") testExtract("w ten Czwartek jaka będzie pogoda", "2017-06-29 00:00:00", "jaka będzie pogoda") testExtract("Jaka była pogoda w ostatni Poniedziałek", "2017-06-26 00:00:00", "jaka była pogoda") testExtract("Ustaw alarm na Środę 8 wieczór", "2017-06-28 20:00:00", "ustaw alarm") testExtract("Ustaw alarm na Środę o trzeciej po południu", "2017-06-28 15:00:00", "ustaw alarm") testExtract("Ustaw alarm na Środę o 3 rano", "2017-06-28 03:00:00", "ustaw alarm") testExtract("Ustaw alarm na 7:00 wieczorem", "2017-06-27 19:00:00", "ustaw alarm") testExtract("5 czerwca 2017 wieczorem przypomnij mi bym" + " zadzwonił do mamy", "2017-06-05 19:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("dodaj do mojego kalendarza poranne spotkanie z Juliuszem" + " czwartego Marca", "2018-03-04 08:00:00", "dodaj do mojego kalendarza spotkanie z juliuszem") testExtract("Przypomnij mi bym zadzwonił do mamy w następny Wtorek", "2017-07-04 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 3 tygodnie", "2017-07-18 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 8 tygodni", "2017-08-22 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 8 tygodni i 2 dni", "2017-08-24 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 4 dni", "2017-07-01 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 3 miesiące", "2017-09-27 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy za 2 lata i 2 dni", "2019-06-29 00:00:00", "przypomnij mi bym zadzwonił do mamy") 
testExtract("Przypomnij mi bym zadzwonił do mamy w następnym tygodniu", "2017-07-04 00:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 10 rano w Sobotę", "2017-07-01 10:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 10 rano w tę Sobotę", "2017-07-01 10:00:00", "przypomnij mi bym zadzwonił do mamy") testExtract("Przypomnij mi bym zadzwonił do mamy o 10 w następną Sobotę", "2017-07-01 10:00:00", "przypomnij mi bym zadzwonił do mamy") # test yesterday testExtract("Jaki dzień był wczoraj", "2017-06-26 00:00:00", "jaki dzień był") testExtract("Jaki dzień był przedwczoraj", "2017-06-25 00:00:00", "jaki dzień był") testExtract("Miałem kolację wczoraj o 6", "2017-06-26 06:00:00", "miałem kolację") testExtract("Miałem kolację wczoraj o 6 rano", "2017-06-26 06:00:00", "miałem kolację") testExtract("Miałem kolację wczoraj o 6 wieczorem", "2017-06-26 18:00:00", "miałem kolację") # Below two tests, ensure that time is picked # even if no am/pm is specified # in case of weekdays/tonight # TODO imperfect as leaves "dzień robocze", but calculates time correctly testExtract("Nastaw alarm na 9 w dni robocze", "2017-06-27 21:00:00", "nastaw alarm dzień robocze") testExtract("na 8 wieczorem", "2017-06-27 20:00:00", "") testExtract("na 8:30 wieczorem", "2017-06-27 20:30:00", "") # Tests a time with ':' & without am/pm testExtract("nastaw alarm na 9:30 wieczorem", "2017-06-27 21:30:00", "nastaw alarm") testExtract("nastaw alarm na 9:00 wieczorem", "2017-06-27 21:00:00", "nastaw alarm") # Check if it picks the intent irrespective of correctness testExtract("przypomnij mi o grze dzisiaj o 11:30 wieczorem", "2017-06-27 23:30:00", "przypomnij mi o grze") testExtract("ustaw alarm na 7:30 w dni robocze", "2017-06-27 19:30:00", "ustaw alarm w dzień robocze") # "# days " testExtract("moje urodziny są za 2 dni", "2017-06-29 00:00:00", "moje urodziny są") testExtract("moje urodziny są za 2 dni od 
dzisiaj", "2017-06-29 00:00:00", "moje urodziny są") testExtract("moje urodziny są za 2 dni od jutra", "2017-06-30 00:00:00", "moje urodziny są") testExtract("moje urodziny są 2 dni po jutrze", "2017-06-30 00:00:00", "moje urodziny są") testExtract("przypomnij mi żebym zadzwonił do mamy o 10 rano 2 dni po następnej Sobocie", "2017-07-10 10:00:00", "przypomnij mi żebym zadzwonił do mamy") testExtract("moje urodziny są za 2 dni od wczoraj", "2017-06-28 00:00:00", "moje urodziny są") # "# days ago>" testExtract("moje urodziny były 1 dzień temu", "2017-06-26 00:00:00", "moje urodziny były") testExtract("moje urodziny były 2 dni temu", "2017-06-25 00:00:00", "moje urodziny były") testExtract("moje urodziny były 3 dni temu", "2017-06-24 00:00:00", "moje urodziny były") testExtract("moje urodziny były 4 dni temu", "2017-06-23 00:00:00", "moje urodziny były") testExtract("spotkajmy się w nocy", "2017-06-27 22:00:00", "spotkajmy się") testExtract("jaka będzie pogoda jutro w nocy", "2017-06-28 22:00:00", "jaka będzie pogoda") testExtract("jaka będzie pogoda w następny Wtorek nocy", "2017-07-04 22:00:00", "jaka będzie pogoda") def test_extract_ambiguous_time_pl(self): morning = datetime(2017, 6, 27, 8, 1, 2) evening = datetime(2017, 6, 27, 20, 1, 2) noonish = datetime(2017, 6, 27, 12, 1, 2) self.assertEqual( extract_datetime('nakarm rybę'), None) self.assertEqual( extract_datetime('dzień'), None) self.assertEqual( extract_datetime('tydzień'), None) self.assertEqual( extract_datetime('miesiąc'), None) self.assertEqual( extract_datetime('rok'), None) self.assertEqual( extract_datetime(' '), None) def test_extract_relativedatetime_pl(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 10, 1, 2, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(normalize(text)) 
self.assertEqual(res[0], expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) testExtract("spotkajmy się za 5 minut", "2017-06-27 10:06:02", "spotkajmy się") testExtract("spotkajmy się za 5minut", "2017-06-27 10:06:02", "spotkajmy się") testExtract("spotkajmy się za 5 sekund", "2017-06-27 10:01:07", "spotkajmy się") testExtract("spotkajmy się za 1 godzinę", "2017-06-27 11:01:02", "spotkajmy się") testExtract("spotkajmy się za 2 godziny", "2017-06-27 12:01:02", "spotkajmy się") testExtract("spotkajmy się za 2godziny", "2017-06-27 12:01:02", "spotkajmy się") testExtract("spotkajmy się za 1 minutę", "2017-06-27 10:02:02", "spotkajmy się") testExtract("spotkajmy się za 1 sekundę", "2017-06-27 10:01:03", "spotkajmy się") testExtract("spotkajmy się za 5sekund", "2017-06-27 10:01:07", "spotkajmy się") def test_spaces(self): self.assertEqual(normalize(" to jest test"), "to jest test") self.assertEqual(normalize(" to jest test "), "to jest test") self.assertEqual(normalize(" to jest jeden test"), "to jest 1 test") def test_numbers(self): self.assertEqual(normalize("to jest jeden dwa trzy test"), "to jest 1 2 3 test") self.assertEqual(normalize(" to jest cztery pięć sześć test"), "to jest 4 5 6 test") self.assertEqual(normalize("to jest dziesięć jedenaście dwanaście test"), "to jest 10 11 12 test") self.assertEqual(normalize("to jest osiemnaście dziewiętnaście dwadzieścia"), "to jest 18 19 20") self.assertEqual(normalize("to jest jeden dziewiętnaście dwadzieścia dwa"), "to jest 1 19 20 2") self.assertEqual(normalize("to jest jeden dwa dwadzieścia dwa"), "to jest 1 2 20 2") self.assertEqual(normalize("to jest jeden i pół"), "to jest 1 pół") self.assertEqual(normalize("to jest jeden i pół i pięć sześć"), "to jest 1 pół 5 6") def test_multiple_numbers(self): self.assertEqual(extract_numbers("to jest jeden dwa trzy test"), [1.0, 2.0, 3.0]) self.assertEqual(extract_numbers("to jest cztery pięć sześć test"), [4.0, 5.0, 6.0]) 
self.assertEqual(extract_numbers("to jest dziesięć jedenaście dwanaście test"), [10.0, 11.0, 12.0]) self.assertEqual(extract_numbers("to jest jeden dwadzieścia jeden test"), [1.0, 21.0]) self.assertEqual(extract_numbers("1 pies, siedem świń, macdonald miał " "farmę, 3 razy 5 macarena"), [1, 7, 3, 5]) self.assertEqual(extract_numbers("dwa piwa dwa wina"), [2.0, 2.0]) self.assertEqual(extract_numbers("dwadzieścia 20 dwadzieścia"), [20, 20, 20]) self.assertEqual(extract_numbers("dwadzieścia 20 22"), [20.0, 20.0, 22.0]) self.assertEqual(extract_numbers("dwadzieścia dwadzieścia dwa dwadzieścia"), [20.0, 22.0, 20.0]) self.assertEqual(extract_numbers("dwadzieścia 2"), [22.0]) self.assertEqual(extract_numbers("dwadzieścia 20 dwadzieścia 2"), [20, 20, 22]) self.assertEqual(extract_numbers("jedna trzecia jeden"), [1 / 3, 1]) self.assertEqual(extract_numbers("trzeci", ordinals=True), [3]) self.assertEqual(extract_numbers("sześć trylionów"), [6e18]) self.assertEqual(extract_numbers("dwie świnie i sześć bilionów bakterii", lang='pl-pl'), [2, 6e12]) self.assertEqual(extract_numbers("trzydziesty drugi lub pierwszy", ordinals=True), [32, 1]) self.assertEqual(extract_numbers("to jest siedem osiem dziewięć i" " pół test"), [7.0, 8.0, 9.5]) if __name__ == "__main__": unittest.main() lingua-franca-release-v0.4.3/test/test_parse_pt.py000066400000000000000000000333421426211343400222430ustar00rootroot00000000000000# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
#
import unittest
from datetime import datetime, time

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.parse import get_gender
from lingua_franca.parse import extract_datetime
from lingua_franca.parse import extract_number
from lingua_franca.parse import normalize
from lingua_franca.time import default_timezone


def setUpModule():
    """Load Portuguese and make it the default language for this module."""
    load_language('pt-pt')
    set_default_lang('pt')


def tearDownModule():
    """Unload the Portuguese language data loaded by ``setUpModule``."""
    unload_language('pt')


class TestNormalize(unittest.TestCase):
    """
        Test cases for Portuguese parsing

        Every call passes ``lang="pt"`` explicitly.
    """

    def test_articles_pt(self):
        """Check that normalize drops articles only when remove_articles=True."""
        self.assertEqual(normalize("isto é o teste",
                                   lang="pt", remove_articles=True),
                         "isto é teste")
        self.assertEqual(
            normalize("isto é a frase", lang="pt", remove_articles=True),
            "isto é frase")
        self.assertEqual(
            normalize("e outro teste", lang="pt", remove_articles=True),
            "outro teste")
        # With remove_articles=False the sentence must come back unchanged.
        self.assertEqual(normalize("isto é o teste extra",
                                   lang="pt",
                                   remove_articles=False),
                         "isto é o teste extra")

    def test_extractnumber_pt(self):
        """Check extract_number on Portuguese digits, words and fractions."""
        # Ordinal and cardinal words, and plain digits.
        self.assertEqual(extract_number("isto e o primeiro teste", lang="pt"),
                         1)
        self.assertEqual(extract_number("isto e o 2 teste", lang="pt"), 2)
        self.assertEqual(extract_number("isto e o segundo teste", lang="pt"),
                         2)
        self.assertEqual(extract_number("isto e um terço de teste",
                                        lang="pt"), 1.0 / 3.0)
        self.assertEqual(extract_number("isto e o teste numero quatro",
                                        lang="pt"), 4)
        # Fractions, spelled out and written as "n/m".
        self.assertEqual(extract_number("um terço de chavena", lang="pt"),
                         1.0 / 3.0)
        self.assertEqual(extract_number("3 canecos", lang="pt"), 3)
        self.assertEqual(extract_number("1/3 canecos", lang="pt"), 1.0 / 3.0)
        self.assertEqual(extract_number("quarto de hora", lang="pt"), 0.25)
        self.assertEqual(extract_number("1/4 hora", lang="pt"), 0.25)
        self.assertEqual(extract_number("um quarto hora", lang="pt"), 0.25)
        self.assertEqual(extract_number("2/3 pinga", lang="pt"), 2.0 / 3.0)
        self.assertEqual(extract_number("3/4 pinga", lang="pt"), 3.0 / 4.0)
        # Whole number combined with a fraction.
        self.assertEqual(extract_number("1 e 3/4 cafe", lang="pt"), 1.75)
        self.assertEqual(extract_number("1 cafe e meio", lang="pt"), 1.5)
        self.assertEqual(extract_number("um cafe e um meio", lang="pt"), 1.5)
        # Same fraction with different capitalisation, accent and plural form.
        self.assertEqual(
            extract_number("tres quartos de chocolate", lang="pt"), 3.0 / 4.0)
        self.assertEqual(
            extract_number("Tres quartos de chocolate", lang="pt"), 3.0 / 4.0)
        self.assertEqual(extract_number("três quarto de chocolate",
                                        lang="pt"), 3.0 / 4.0)
        # Decimals introduced by "ponto", "virgula" or "e".
        self.assertEqual(extract_number("sete ponto cinco", lang="pt"), 7.5)
        self.assertEqual(extract_number("sete ponto 5", lang="pt"), 7.5)
        self.assertEqual(extract_number("sete e meio", lang="pt"), 7.5)
        self.assertEqual(extract_number("sete e oitenta", lang="pt"), 7.80)
        self.assertEqual(extract_number("sete e oito", lang="pt"), 7.8)
        # Each spoken "zero" after the separator adds one leading decimal zero.
        self.assertEqual(extract_number("sete e zero oito",
                                        lang="pt"), 7.08)
        self.assertEqual(extract_number("sete e zero zero oito",
                                        lang="pt"), 7.008)
        self.assertEqual(extract_number("vinte treze avos", lang="pt"),
                         20.0 / 13.0)
        self.assertEqual(extract_number("seis virgula seiscentos e sessenta",
                                        lang="pt"), 6.66)
        self.assertEqual(extract_number("seiscentos e sessenta e seis",
                                        lang="pt"), 666)
        self.assertEqual(extract_number("seiscentos ponto zero seis",
                                        lang="pt"), 600.06)
        self.assertEqual(extract_number("seiscentos ponto zero zero seis",
                                        lang="pt"), 600.006)
        self.assertEqual(extract_number("seiscentos ponto zero zero zero seis",
                                        lang="pt"), 600.0006)

    def test_agressive_pruning_pt(self):
        """Check which words normalize removes from Portuguese input.

        The expected strings show number words becoming digits and words
        such as "esta", "o", "-lhe" and "nesse" being dropped.
        """
        self.assertEqual(normalize("uma palavra", lang="pt"),
                         "1 palavra")
        self.assertEqual(normalize("esta palavra um", lang="pt"),
                         "palavra 1")
        self.assertEqual(normalize("o homem batia-lhe", lang="pt"),
                         "homem batia")
        self.assertEqual(normalize("quem disse asneira nesse dia", lang="pt"),
                         "quem disse asneira dia")

    def test_spaces_pt(self):
        """Check that normalize strips surrounding whitespace."""
        # NOTE(review): this copy of the file may have had runs of spaces
        # collapsed; confirm the inputs below against the upstream source.
        self.assertEqual(normalize(" isto e o teste", lang="pt"),
                         "isto teste")
        self.assertEqual(normalize(" isto sao os testes ", lang="pt"),
                         "isto sao testes")
        self.assertEqual(normalize(" isto e um teste", lang="pt",
                                   remove_articles=False),
                         "isto 1 teste")
def test_numbers_pt(self): self.assertEqual(normalize("isto e o um dois três teste", lang="pt"), "isto 1 2 3 teste") self.assertEqual(normalize("é a sete oito nove test", lang="pt"), "é 7 8 9 test") self.assertEqual( normalize("teste zero dez onze doze treze", lang="pt"), "teste 0 10 11 12 13") self.assertEqual( normalize("teste mil seiscentos e sessenta e seis", lang="pt", remove_articles=False), "teste 1000 600 60 6") self.assertEqual( normalize("teste sete e meio", lang="pt", remove_articles=False), "teste 7 meio") self.assertEqual( normalize("teste dois ponto nove", lang="pt"), "teste 2 ponto 9") self.assertEqual( normalize("teste cento e nove", lang="pt", remove_articles=False), "teste 100 9") self.assertEqual( normalize("teste vinte e 1", lang="pt"), "teste 20 1") def test_extractdatetime_pt(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 0, 0, tzinfo=default_timezone()) [extractedDate, leftover] = extract_datetime(text, date, lang="pt") extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(text) self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) testExtract("que dia é hoje", "2017-06-27 00:00:00", "dia") testExtract("que dia é amanha", "2017-06-28 00:00:00", "dia") testExtract("que dia foi ontem", "2017-06-26 00:00:00", "dia") testExtract("que dia foi antes de ontem", "2017-06-25 00:00:00", "dia") testExtract("que dia foi ante ontem", "2017-06-25 00:00:00", "dia") testExtract("que dia foi ante ante ontem", "2017-06-24 00:00:00", "dia") testExtract("marca o jantar em 5 dias", "2017-07-02 00:00:00", "marca jantar") testExtract("como esta o tempo para o dia depois de amanha?", "2017-06-29 00:00:00", "como tempo") testExtract("lembra me ás 10:45 pm", "2017-06-27 22:45:00", "lembra") testExtract("como esta o tempo na sexta de manha", "2017-06-30 08:00:00", "como tempo") testExtract("lembra me 
para ligar a mãe daqui " "a 8 semanas e 2 dias", "2017-08-24 00:00:00", "lembra ligar mae") testExtract("Toca black metal 2 dias a seguir a sexta", "2017-07-02 00:00:00", "toca black metal") testExtract("Toca satanic black metal 2 dias para esta sexta", "2017-07-02 00:00:00", "toca satanic black metal") testExtract("Toca super black metal 2 dias a partir desta sexta", "2017-07-02 00:00:00", "toca super black metal") testExtract("Começa a invasão ás 3:45 pm de quinta feira", "2017-06-29 15:45:00", "comeca invasao") testExtract("na segunda, compra queijo", "2017-07-03 00:00:00", "compra queijo") testExtract("Toca os parabéns daqui a 5 anos", "2022-06-27 00:00:00", "toca parabens") testExtract("manda Skype a Mãe ás 12:45 pm próxima quinta", "2017-06-29 12:45:00", "manda skype mae") testExtract("como está o tempo esta sexta?", "2017-06-30 00:00:00", "como tempo") testExtract("como está o tempo esta sexta de tarde?", "2017-06-30 15:00:00", "como tempo") testExtract("como está o tempo esta sexta as tantas da manha?", "2017-06-30 04:00:00", "como tempo") testExtract("como está o tempo esta sexta a meia noite?", "2017-06-30 00:00:00", "como tempo") testExtract("como está o tempo esta sexta ao meio dia?", "2017-06-30 12:00:00", "como tempo") testExtract("como está o tempo esta sexta ao fim da tarde?", "2017-06-30 19:00:00", "como tempo") testExtract("como está o tempo esta sexta ao meio da manha?", "2017-06-30 10:00:00", "como tempo") testExtract("lembra me para ligar a mae no dia 3 de agosto", "2017-08-03 00:00:00", "lembra ligar mae") testExtract("compra facas no 13º dia de maio", "2018-05-13 00:00:00", "compra facas") testExtract("gasta dinheiro no maio dia 13", "2018-05-13 00:00:00", "gasta dinheiro") testExtract("compra velas a maio 13", "2018-05-13 00:00:00", "compra velas") testExtract("bebe cerveja a 13 maio", "2018-05-13 00:00:00", "bebe cerveja") testExtract("como esta o tempo 1 dia a seguir a amanha", "2017-06-29 00:00:00", "como tempo") testExtract("como esta o 
tempo ás 0700 horas", "2017-06-27 07:00:00", "como tempo") testExtract("como esta o tempo amanha ás 7 em ponto", "2017-06-28 07:00:00", "como tempo") testExtract("como esta o tempo amanha pelas 2 da tarde", "2017-06-28 14:00:00", "como tempo") testExtract("como esta o tempo amanha pelas 2", "2017-06-28 02:00:00", "como tempo") testExtract("como esta o tempo pelas 2 da tarde da proxima sexta", "2017-06-30 14:00:00", "como tempo") testExtract("lembra-me de acordar em 4 anos", "2021-06-27 00:00:00", "lembra acordar") testExtract("lembra-me de acordar em 4 anos e 4 dias", "2021-07-01 00:00:00", "lembra acordar") testExtract("dorme 3 dias depois de amanha", "2017-07-02 00:00:00", "dorme") testExtract("marca consulta para 2 semanas e 6 dias depois de Sabado", "2017-07-21 00:00:00", "marca consulta") testExtract("começa a festa ás 8 em ponto da noite de quinta", "2017-06-29 20:00:00", "comeca festa") def test_extractdatetime_default_pt(self): default = time(9, 0, 0) anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime( 'marca consulta para 2 semanas e 6 dias depois de Sabado', anchor, lang='pt-pt', default_time=default) self.assertEqual(default, res[0].time()) class TestExtractGender(unittest.TestCase): def test_gender_pt(self): # words with well defined grammatical gender rules self.assertEqual(get_gender("vaca", lang="pt"), "f") self.assertEqual(get_gender("cavalo", lang="pt"), "m") self.assertEqual(get_gender("vacas", lang="pt"), "f") # words specifically defined in a lookup dictionary self.assertEqual(get_gender("homem", lang="pt"), "m") self.assertEqual(get_gender("mulher", lang="pt"), "f") self.assertEqual(get_gender("homems", lang="pt"), "m") self.assertEqual(get_gender("mulheres", lang="pt"), "f") # words where gender rules do not work but context does self.assertEqual(get_gender("boi", lang="pt"), None) self.assertEqual(get_gender("boi", "o boi come erva", lang="pt"), "m") self.assertEqual(get_gender("homem", "este homem come bois", lang="pt"), "m") 
class TestFuzzyMatch(unittest.TestCase):
    """Checks ``fuzzy_match`` scores and ``match_one`` selection on Russian text."""

    def test_matches(self):
        """Similarity scores respect the expected thresholds and are symmetric."""
        # Rich comparison assertions report the offending score on failure,
        # whereas assertTrue(<comparison>) only prints "False is not true".
        self.assertGreaterEqual(fuzzy_match("ты и мы", "ты и мы"), 1.0)
        self.assertLess(fuzzy_match("ты и мы", "ты"), 0.5)
        self.assertGreaterEqual(fuzzy_match("Ты", "ты"), 0.5)
        # Swapping the arguments must not change the score.
        self.assertEqual(fuzzy_match("ты и мы", "ты"),
                         fuzzy_match("ты", "ты и мы"))
        self.assertLess(fuzzy_match("ты и мы", "он или они"), 0.36)

    def test_match_one(self):
        """``match_one`` picks the closest choice from a list or a dict."""
        # test list of choices: element 0 of the result is the matched string
        choices = ['фрэнк', 'кейт', 'гарри', 'генри']
        self.assertEqual(match_one('фрэнк', choices)[0], 'фрэнк')
        self.assertEqual(match_one('фрэн', choices)[0], 'фрэнк')
        self.assertEqual(match_one('енри', choices)[0], 'генри')
        self.assertEqual(match_one('кэтт', choices)[0], 'кейт')
        # test dictionary of choices: element 0 is the value stored under
        # the best-matching key
        choices = {'фрэнк': 1, 'кейт': 2, 'гарри': 3, 'генри': 4}
        self.assertEqual(match_one('фрэнк', choices)[0], 1)
        self.assertEqual(match_one('енри', choices)[0], 4)
self.assertEqual(extract_number( "Двадцать два с заглавной буквой в начале"), 22) self.assertEqual(extract_number( "Двадцать Два с двумя заглавными буквами"), 22) self.assertEqual(extract_number( "двадцать Два с другой заглавной буквой"), 22) # self.assertEqual(extract_number("Двадцать два и Три Пятых"), 22.6) self.assertEqual(extract_number("двести"), 200) self.assertEqual(extract_number("девять тысяч"), 9000) self.assertEqual(extract_number("шестьсот шестьдесят шесть"), 666) self.assertEqual(extract_number("два миллиона"), 2000000) self.assertEqual(extract_number("два миллиона пятьсот тысяч " "тонн чугуна"), 2500000) self.assertEqual(extract_number("шесть триллионов"), 6000000000000.0) self.assertEqual(extract_number("шесть триллионов", short_scale=False), 6e+18) self.assertEqual(extract_number("один точка пять"), 1.5) self.assertEqual(extract_number("три точка четырнадцать"), 3.14) self.assertEqual(extract_number("ноль точка два"), 0.2) self.assertEqual(extract_number("миллиард лет"), 1000000000.0) self.assertEqual(extract_number("биллион лет", short_scale=False), 1000000000000.0) self.assertEqual(extract_number("сто тысяч"), 100000) self.assertEqual(extract_number("минус 2"), -2) self.assertEqual(extract_number("минус семьдесят"), -70) self.assertEqual(extract_number("тысяча миллионов"), 1000000000) self.assertEqual(extract_number("миллиард", short_scale=False), 1000000000) # self.assertEqual(extract_number("шестая треть"), # 1 / 6 / 3) # self.assertEqual(extract_number("шестая треть", ordinals=True), # 6) self.assertEqual(extract_number("тридцать секунд"), 30) self.assertEqual(extract_number("тридцать два", ordinals=True), 32) self.assertEqual(extract_number("вот это миллиардный тест", ordinals=True), 1e09) self.assertEqual(extract_number("вот это одна миллиардная теста"), 1e-9) self.assertEqual(extract_number("вот это биллионный тест", ordinals=True, short_scale=False), 1e12) # self.assertEqual(extract_number("вот это одна биллионная теста", # 
short_scale=False), 1e-12) # Verify non-power multiples of ten no longer discard # adjacent multipliers self.assertEqual(extract_number("двадцать тысяч"), 20000) self.assertEqual(extract_number("пятьдесят миллионов"), 50000000) # Verify smaller powers of ten no longer cause miscalculation of larger # powers of ten (see MycroftAI#86) self.assertEqual(extract_number("двадцать миллиардов триста миллионов " "девятьсот пятьдесят тысяч " "шестьсот семьдесят пять точка восемь"), 20300950675.8) self.assertEqual(extract_number("девятьсот девяносто девять миллионов " "девятьсот девяносто девять тысяч " "девятьсот девяносто девять точка девять"), 999999999.9) # TODO why does "trillion" result in xxxx.0? self.assertEqual(extract_number("восемьсот триллионов двести \ пятьдесят семь"), 800000000000257.0) # TODO handle this case # self.assertEqual( # extract_number("6 dot six six six"), # 6.666) self.assertTrue(extract_number("Теннисист скорый") is False) self.assertTrue(extract_number("хрупкий") is False) self.assertTrue(extract_number("хрупкий ноль") is not False) self.assertEqual(extract_number("хрупкий ноль"), 0) self.assertTrue(extract_number("грубый 0") is not False) self.assertEqual(extract_number("грубый 0"), 0) self.assertEqual(extract_number("пара пива"), 2) # self.assertEqual(extract_number("пара сотен пив"), 200) self.assertEqual(extract_number("пара тысяч пив"), 2000) self.assertEqual(extract_number( "вот это 7 тест", ordinals=True), 7) self.assertEqual(extract_number( "вот это 7 тест", ordinals=False), 7) self.assertTrue(extract_number("вот это n. тест") is False) self.assertEqual(extract_number("вот это 1. тест"), 1) self.assertEqual(extract_number("вот это 2. тест"), 2) self.assertEqual(extract_number("вот это 3. тест"), 3) self.assertEqual(extract_number("вот это 31. тест"), 31) self.assertEqual(extract_number("вот это 32. тест"), 32) self.assertEqual(extract_number("вот это 33. тест"), 33) self.assertEqual(extract_number("вот это 34. 
def test_extract_duration_ru(self):
    """Russian duration phrases yield the expected (timedelta, leftover) pair."""
    expectations = [
        ("10 секунд", timedelta(seconds=10.0), ""),
        ("5 минут", timedelta(minutes=5), ""),
        ("2 часа", timedelta(hours=2), ""),
        ("3 дня", timedelta(days=3), ""),
        ("25 недель", timedelta(weeks=25), ""),
        ("семь часов", timedelta(hours=7), ""),
        ("7.5 секунд", timedelta(seconds=7.5), ""),
        ("восемь с половиной дней тридцать девять секунд",
         timedelta(days=8.5, seconds=39), ""),
        ("Установи таймер на 30 минут",
         timedelta(minutes=30), "установи таймер на"),
        ("Четыре с половиной минуты до заката",
         timedelta(minutes=4.5), "до заката"),
        ("девятнадцать минут через час",
         timedelta(minutes=19), "через час"),
        # Disabled in the suite (kept for reference, not executed):
        #   "разбуди меня через три недели, четыреста девяносто семь дней "
        #   "и триста 91.6 секунд"
        #   -> (timedelta(weeks=3, days=497, seconds=391.6),
        #       "разбуди меня через , a")
        ("фильм один час пятьдесят семь и пол минуты длиной",
         timedelta(hours=1, minutes=57.5), "фильм длиной"),
        ("10-секунд", timedelta(seconds=10.0), ""),
        ("5-минут", timedelta(minutes=5), ""),
    ]
    for phrase, delta, leftover in expectations:
        self.assertEqual(extract_duration(phrase), (delta, leftover))
self.assertEqual(res[0], expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) testExtract("теперь пора", "2017-06-27 13:04:00", "пора") self.u = "секунду" testExtract("через %s" % self.u, "2017-06-27 13:04:01", "") testExtract("через минуту", "2017-06-27 13:05:00", "") testExtract("через две минуты", "2017-06-27 13:06:00", "") # testExtract("через пару минут", # "2017-06-27 13:06:00", "") testExtract("через два часа", "2017-06-27 15:04:00", "") # testExtract("через пару часов", # "2017-06-27 15:04:00", "") testExtract("через две недели", "2017-07-11 00:00:00", "") # testExtract("через пару недель", # "2017-07-11 00:00:00", "") testExtract("через два месяца", "2017-08-27 00:00:00", "") testExtract("через два года", "2019-06-27 00:00:00", "") # testExtract("через пару месяцев", # "2017-08-27 00:00:00", "") # testExtract("через пару лет", # "2019-06-27 00:00:00", "") testExtract("через десятилетие", "2027-06-27 00:00:00", "") # testExtract("через пару десятилетий", # "2037-06-27 00:00:00", "") testExtract("следующее десятилетие", "2027-06-27 00:00:00", "") testExtract("через столетие", "2117-06-27 00:00:00", "") testExtract("через тысячелетие", "3017-06-27 00:00:00", "") # testExtract("через два десятилетия", # "2037-06-27 00:00:00", "") # testExtract("через 5 десятилетий", # "2067-06-27 00:00:00", "") # testExtract("через два века", # "2217-06-27 00:00:00", "") # testExtract("через пару веков", # "2217-06-27 00:00:00", "") # testExtract("через два тысячелетия", # "4017-06-27 00:00:00", "") # testExtract("через две тысячи лет", # "4017-06-27 00:00:00", "") # testExtract("через пару тысячелетий", # "4017-06-27 00:00:00", "") # testExtract("через пару тысяч лет", # "4017-06-27 00:00:00", "") testExtract("через год", "2018-06-27 00:00:00", "") testExtract("хочу мороженое через час", "2017-06-27 14:04:00", "хочу мороженое") testExtract("через 1 секунду", "2017-06-27 13:04:01", "") testExtract("через 2 секунды", "2017-06-27 13:04:02", 
"") testExtract("Установи таймер на 1 минуту", "2017-06-27 13:05:00", "установи таймер") testExtract("Установи таймер на пол часа", "2017-06-27 13:34:00", "установи таймер") # testExtract("Установи таймер на 5 дней с сегодня", # "2017-07-02 00:00:00", "установи таймер") testExtract("послезавтра", "2017-06-29 00:00:00", "") testExtract("после завтра", "2017-06-29 00:00:00", "") testExtract("Какая погода послезавтра?", "2017-06-29 00:00:00", "какая погода") testExtract("Напомни мне в 10:45 pm", "2017-06-27 22:45:00", "напомни мне") testExtract("Напомни мне в 10:45 вечера", "2017-06-27 22:45:00", "напомни мне") testExtract("какая погода в пятницу утром", "2017-06-30 08:00:00", "какая погода") testExtract("какая завтра погода", "2017-06-28 00:00:00", "какая погода") testExtract("какая погода сегодня днём", "2017-06-27 15:00:00", "какая погода") testExtract("какая погода сегодня вечером", "2017-06-27 19:00:00", "какая погода") testExtract("какая была погода сегодня утром", "2017-06-27 08:00:00", "какая была погода") testExtract("напомни мне позвонить маме через 8 недель и 2 дня", "2017-08-24 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в августе 3", "2017-08-03 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 7am", "2017-06-28 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 7утра", "2017-06-28 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 10pm", "2017-06-28 22:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 7 вечера", "2017-06-28 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 10 вечера", "2017-06-28 22:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 7 часов вечера", "2017-06-28 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне завтра позвонить маме в 10 часов вечера", 
"2017-06-28 22:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7am", "2017-06-28 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7утра", "2017-06-28 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через час", "2017-06-27 14:04:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 1730", "2017-06-27 17:30:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 0630", "2017-06-28 06:30:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 06 30 часов", "2017-06-28 06:30:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 06 30", "2017-06-28 06:30:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 06 30 часа", "2017-06-28 06:30:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7 часов", "2017-06-27 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме вечером в 7 часов", "2017-06-27 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7 часов вечером", "2017-06-27 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7 часов утра", "2017-06-28 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в четверг вечером в 7 часов", "2017-06-29 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в четверг утром в 7 часов", "2017-06-29 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7 часов в четверг утром", "2017-06-29 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7:00 в четверг утром", "2017-06-29 07:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 7:00 в четверг вечером", "2017-06-29 19:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 8 вечера среды", 
"2017-06-28 20:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 8 в среду вечером", "2017-06-28 20:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме вечером среды в 8", "2017-06-28 20:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через два часа", "2017-06-27 15:04:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 2 часа", "2017-06-27 15:04:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 15 минут", "2017-06-27 13:19:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через пятнадцать минут", "2017-06-27 13:19:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через пол часа", "2017-06-27 13:34:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через четверть часа", "2017-06-27 13:19:00", "напомни мне позвонить маме") # testExtract("напомни мне позвонить маме в 10am на 2 день после этой субботы", # "2017-07-03 10:00:00", "напомни мне позвонить маме") testExtract("Слушайте музыку Рика Эстли через 2 дня с пятницы", "2017-07-02 00:00:00", "слушайте музыку рика эстли") testExtract("Начать вторжение в 3:45 pm в четверг", "2017-06-29 15:45:00", "начать вторжение") testExtract("Начать вторжение в 3:45 вечера в четверг", "2017-06-29 15:45:00", "начать вторжение") testExtract("Начать вторжение в 3:45 дня в четверг", "2017-06-29 15:45:00", "начать вторжение") testExtract("В понедельник закажи торт из пекарни", "2017-07-03 00:00:00", "закажи торт из пекарни") testExtract("Включи музыку с днем рождения через 5 лет", "2022-06-27 00:00:00", "включи музыку с днем рождения") testExtract("Скайп Маме в 12:45 pm в следующий четверг", "2017-07-06 12:45:00", "скайп маме") testExtract("Скайп Маме в 12:45 дня в следующий четверг", "2017-07-06 12:45:00", "скайп маме") testExtract("Какая погода в следующую пятницу?", "2017-06-30 00:00:00", "какая погода") 
testExtract("Какая погода в следующую среду?", "2017-07-05 00:00:00", "какая погода") testExtract("Какая погода в следующий четверг?", "2017-07-06 00:00:00", "какая погода") testExtract("Какая погода в следующую пятницу утром", "2017-06-30 08:00:00", "какая погода") testExtract("какая погода в следующую пятницу вечером", "2017-06-30 19:00:00", "какая погода") testExtract("какая погода в следующую пятницу днём", "2017-06-30 15:00:00", "какая погода") testExtract("какая погода в следующую пятницу в полдень", "2017-06-30 12:00:00", "какая погода") testExtract("напомни мне позвонить маме третьего августа", "2017-08-03 00:00:00", "напомни мне позвонить маме") # testExtract("купить фейерверк в 4 в четверг", # "2017-07-04 00:00:00", "купить фейерверк") testExtract("какая погода через 2 недели со следующей пятницы", "2017-07-14 00:00:00", "какая погода") testExtract("какая погода в среду в 0700 часов", "2017-06-28 07:00:00", "какая погода") testExtract("Поставь будильник в среду в 7 часов", "2017-06-28 07:00:00", "поставь будильник") testExtract("Назначь встречу в 12:45 pm в следующий четверг", "2017-07-06 12:45:00", "назначь встречу") testExtract("Назначь встречу в 12:45 дня в следующий четверг", "2017-07-06 12:45:00", "назначь встречу") testExtract("Какая погода в этот четверг?", "2017-06-29 00:00:00", "какая погода") testExtract("назначь встречу через 2 недели и 6 дней с субботы", "2017-07-21 00:00:00", "назначь встречу") testExtract("Начать вторжение в 03 45 в четверг", "2017-06-29 03:45:00", "начать вторжение") testExtract("Начать вторжение в 800 часов в четверг", "2017-06-29 08:00:00", "начать вторжение") testExtract("Начать вечеринку в 8 часов вечером в четверг", "2017-06-29 20:00:00", "начать вечеринку") testExtract("Начать вторжение в 8 вечера в четверг", "2017-06-29 20:00:00", "начать вторжение") testExtract("Начать вторжение в четверг в полдень", "2017-06-29 12:00:00", "начать вторжение") testExtract("Начать вторжение в четверг в полночь", "2017-06-29 00:00:00", 
"начать вторжение") testExtract("Начать вторжение в четверг в 0500", "2017-06-29 05:00:00", "начать вторжение") testExtract("напомни мне встать через 4 года", "2021-06-27 00:00:00", "напомни мне встать") testExtract("напомни мне встать через 4 года и 4 дня", "2021-07-01 00:00:00", "напомни мне встать") # testExtract("какая погода 3 дня после завтра?", # "2017-07-01 00:00:00", "какая погода") testExtract("3 декабря", "2017-12-03 00:00:00", "") testExtract("мы встретимся в 8:00 сегодня вечером", "2017-06-27 20:00:00", "мы встретимся") testExtract("мы встретимся в 5pm", "2017-06-27 17:00:00", "мы встретимся") testExtract("мы встретимся в 5дня", "2017-06-27 17:00:00", "мы встретимся") testExtract("мы встретимся в 8 am", "2017-06-28 08:00:00", "мы встретимся") testExtract("мы встретимся в 8 утра", "2017-06-28 08:00:00", "мы встретимся") testExtract("мы встретимся в 8 вечера", "2017-06-27 20:00:00", "мы встретимся") testExtract("напомнить мне встать в 8 am", "2017-06-28 08:00:00", "напомнить мне встать") testExtract("напомнить мне встать в 8 утра", "2017-06-28 08:00:00", "напомнить мне встать") testExtract("какая погода во вторник", "2017-06-27 00:00:00", "какая погода") testExtract("какая погода в понедельник", "2017-07-03 00:00:00", "какая погода") testExtract("какая погода в эту среду", "2017-06-28 00:00:00", "какая погода") testExtract("в четверг какая погода", "2017-06-29 00:00:00", "какая погода") testExtract("в этот четверг какая погода", "2017-06-29 00:00:00", "какая погода") testExtract("в прошлый понедельник какая была погода", "2017-06-26 00:00:00", "какая была погода") testExtract("поставь будильник на среду вечером в 8", "2017-06-28 20:00:00", "поставь будильник") testExtract("поставь будильник на среду в 3 часа дня", "2017-06-28 15:00:00", "поставь будильник") testExtract("поставь будильник на среду в 3 часа утра", "2017-06-28 03:00:00", "поставь будильник") testExtract("поставь будильник на среду утром в 7 часов", "2017-06-28 07:00:00", "поставь 
будильник") testExtract("поставь будильник на сегодня в 7 часов", "2017-06-27 19:00:00", "поставь будильник") testExtract("поставь будильник на этот вечер в 7 часов", "2017-06-27 19:00:00", "поставь будильник") testExtract("поставь будильник на этот вечер в 7:00", "2017-06-27 19:00:00", "поставь будильник") # testExtract("поставь будильник этим вечером в 7:00", # "2017-06-27 19:00:00", "поставь будильник") testExtract("вечером 5 июня 2017 напомни мне позвонить маме", "2017-06-05 19:00:00", "напомни мне позвонить маме") testExtract("обнови мой календарь утром свидание с юлиусом" + " 4 марта", "2018-03-04 08:00:00", "обнови мой календарь свидание с юлиусом") testExtract("напомни мне позвонить маме в следующий вторник", "2017-07-04 00:00:00", "напомни мне позвонить маме") # testExtract("напомни мне позвонить маме 3 недели", # "2017-07-18 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 8 недель", "2017-08-22 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 8 недель и 2 дня", "2017-08-24 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 4 дня", "2017-07-01 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 3 месяца", "2017-09-27 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме через 2 года и 2 дня", "2019-06-29 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме на следующей неделе", "2017-07-04 00:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 10am в субботу", "2017-07-01 10:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 10 утра в субботу", "2017-07-01 10:00:00", "напомни мне позвонить маме") # testExtract("напомни мне позвонить маме в 10am в эту субботу", # "2017-07-01 10:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 10 в следующую субботу", "2017-07-01 
10:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 10am в следующую субботу", "2017-07-01 10:00:00", "напомни мне позвонить маме") testExtract("напомни мне позвонить маме в 10 утра в следующую субботу", "2017-07-01 10:00:00", "напомни мне позвонить маме") # test yesterday testExtract("какой был день вчера", "2017-06-26 00:00:00", "какой был день") testExtract("какой был день позавчера", "2017-06-25 00:00:00", "какой был день") testExtract("я позавтракал вчера в 6", "2017-06-26 06:00:00", "я позавтракал") testExtract("я позавтракал вчера в 6 am", "2017-06-26 06:00:00", "я позавтракал") testExtract("я позавтракал вчера в 6 утра", "2017-06-26 06:00:00", "я позавтракал") # Below two tests, ensure that time is picked # even if no am/pm is specified # in case of weekdays/tonight testExtract("поставь будильник на 9 в выходные", "2017-06-27 21:00:00", "поставь будильник выходные") testExtract("на 8 сегодня вечером", "2017-06-27 20:00:00", "") testExtract("на 8:30pm сегодня вечером", "2017-06-27 20:30:00", "") testExtract("на 8:30вечера сегодня", "2017-06-27 20:30:00", "") testExtract("на 8:30 вечера сегодня", "2017-06-27 20:30:00", "") # Tests a time with ':' & without am/pm testExtract("поставь будильник сегодня вечером на 9:30", "2017-06-27 21:30:00", "поставь будильник") testExtract("поставь будильник на 9:00 сегодня вечером", "2017-06-27 21:00:00", "поставь будильник") # Check if it picks intent irrespective of correctness testExtract("поставь будильник в 9 часов сегодня вечером", "2017-06-27 21:00:00", "поставь будильник") testExtract("напомни мне об игре сегодня вечером в 11:30", "2017-06-27 23:30:00", "напомни мне об игре") testExtract("поставь будильник в 7:30 на выходных", "2017-06-27 19:30:00", "поставь будильник на выходных") # "# days " testExtract("мой день рождения через 2 дня с сегодня", "2017-06-29 00:00:00", "мой день рождения") testExtract("мой день рождения через 2 дня от сегодня", "2017-06-29 00:00:00", "мой день 
def test_extract_ambiguous_time_ru(self):
    """Phrases without a time give None; a bare hour is resolved from the anchor."""

    def moment(hour, minute=0, second=0):
        # All fixtures share the date 2017-06-27 in the default timezone.
        return datetime(2017, 6, 27, hour, minute, second,
                        tzinfo=default_timezone())

    morning = moment(8, 1, 2)
    noonish = moment(12, 1, 2)
    evening = moment(20, 1, 2)

    # The 'сегодня' case is disabled in the suite and stays disabled here.
    for phrase in ('кормление рыб', 'день', 'месяц', 'год', ' '):
        self.assertEqual(extract_datetime(phrase), None)

    # "10 часов" carries no am/pm marker; the expected hour depends on
    # the anchor time the extraction is run against.
    request = 'покормить рыб в 10 часов'
    for anchor, expected_hour in ((morning, 10), (noonish, 22), (evening, 22)):
        self.assertEqual(extract_datetime(request, anchor)[0],
                         moment(expected_hour))
def test_multiple_numbers(self):
    """``extract_numbers`` returns the expected list for each Russian phrase."""
    # Each case: (phrase, keyword arguments, expected list of numbers).
    cases = [
        ("вот это один два три тест", {}, [1.0, 2.0, 3.0]),
        ("вот это четыре пять шесть тест", {}, [4.0, 5.0, 6.0]),
        ("вот это десять одиннадцать двенадцать тест", {},
         [10.0, 11.0, 12.0]),
        ("вот это один двадцать один тест", {}, [1.0, 21.0]),
        ("1 собака, семь свиней, у макдонадьда "
         "была ферма ферма, 3 раза по 5 макарен", {}, [1, 7, 3, 5]),
        # Disabled in the suite (not executed):
        #   "два пива для двух медведей" -> [2.0, 2.0]
        ("двадцать 20 двадцать", {}, [20, 20, 20]),
        ("двадцать 20 22", {}, [20.0, 20.0, 22.0]),
        ("двадцать двадцать два двадцать", {}, [20, 22, 20]),
        ("двадцать 2", {}, [22.0]),
        ("двадцать 20 двадцать 2", {}, [20, 20, 22]),
        ("треть один", {}, [1 / 3, 1]),
        ("третий", {"ordinals": True}, [3]),
        ("шесть триллионов", {"short_scale": True}, [6e12]),
        ("шесть триллионов", {"short_scale": False}, [6e18]),
        ("два поросёнка и шесть триллионов бактерий",
         {"short_scale": True}, [2, 6e12]),
        ("два поросёнка и шесть триллионов бактерий",
         {"short_scale": False}, [2, 6e18]),
        ("тридцать второй или первый", {"ordinals": True}, [32, 1]),
        ("вот это семь восемь девять и половина тест", {},
         [7.0, 8.0, 9.5]),
    ]
    for phrase, options, expected in cases:
        self.assertEqual(extract_numbers(phrase, **options), expected)
def test_extractnumber_sv(self):
    """Swedish digits, number words, ordinals and fractions parse to numbers."""
    cases = (
        ("1 och en halv deciliter", 1.5),
        ("det här är det första testet", 1),
        ("det här är test nummer 2", 2),
        ("det här är det andra testet", 2),
        ("det här är tredje testet", 3),
        ("det här är test nummer 4", 4),
        ("en tredjedels dl", 1.0 / 3.0),
        ("tre deciliter", 3),
        ("Tre deciliter", 3),
        ("1/3 deciliter", 1.0 / 3.0),
        ("en kvarts dl", 0.25),
        ("1/4 dl", 0.25),
        ("en kvarts dl", 0.25),
        ("2/3 dl", 2.0 / 3.0),
        ("3/4 dl", 3.0 / 4.0),
        ("1 och 3/4 dl", 1.75),
        ("tre fjärdedels dl", 3.0 / 4.0),
        ("trekvarts kopp", 3.0 / 4.0),
    )
    for phrase, expected in cases:
        self.assertEqual(extract_number(phrase, lang='sv-se'), expected)
extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] def testExtract(text, expected_date, expected_leftover): res = extractWithFormat(text) self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) testExtract("Planera bakhållet 5 dagar från nu", "2017-07-02 00:00:00", "planera bakhållet") testExtract("Vad blir vädret i övermorgon?", "2017-06-29 00:00:00", "vad blir vädret") testExtract("Påminn mig klockan 10:45", "2017-06-27 10:45:00", "påminn mig klockan") testExtract("vad blir vädret på fredag morgon", "2017-06-30 08:00:00", "vad blir vädret") testExtract("vad blir morgondagens väder", "2017-06-28 00:00:00", "vad blir väder") testExtract("påminn mig att ringa mamma om 8 veckor och 2 dagar", "2017-08-24 00:00:00", "påminn mig att ringa mamma om och") testExtract("Spela Kurt Olssons musik 2 dagar från Fredag", "2017-07-02 00:00:00", "spela kurt olssons musik") testExtract("vi möts 20:00", "2017-06-27 20:00:00", "vi möts") def test_extractdatetime_default_sv(self): default = time(9, 0, 0) anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime('påminn mig att klippa mig på fredag', anchor, lang='sv-se', default_time=default) self.assertEqual(default, res[0].time()) def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" self.assertEqual(extract_datetime('Ingen tid', lang='sv-se'), None) def test_numbers(self): self.assertEqual(normalize("det här är ett ett två tre test", lang='sv-se'), "det här är 1 1 2 3 test") self.assertEqual(normalize(" det är fyra fem sex test", lang='sv-se'), "det är 4 5 6 test") self.assertEqual(normalize("det är sju åtta nio test", lang='sv-se'), "det är 7 8 9 test") self.assertEqual(normalize("det är tio elva tolv test", lang='sv-se'), "det är 10 11 12 test") self.assertEqual(normalize("det är arton nitton tjugo test", lang='sv-se'), "det är 18 19 20 test") class TestExtractDuration(unittest.TestCase): def 
test_valid_extract_duration(self): """Duration in sentence.""" td, remains = extract_duration("5 minuter", lang='sv-se') self.assertEqual(td, timedelta(seconds=300)) self.assertEqual(remains, '') td, remains = extract_duration("om 2 och en halv timme", lang='sv-se') self.assertEqual(td, timedelta(hours=2, minutes=30)) self.assertEqual(remains, "om och") td, remains = extract_duration("starta en 9 minuters timer", lang='sv-se') self.assertEqual(td, timedelta(minutes=9)) self.assertEqual(remains, "starta timer") # Extraction of things like "kvart" and "halvtimme" td, remains = extract_duration("i en kvart", lang='sv-se') self.assertEqual(td, timedelta(minutes=15)) self.assertEqual(remains, "i") td, remains = extract_duration("hämta mig om två timmar och en kvart", lang='sv-se') self.assertEqual(td, timedelta(hours=2, minutes=15)) self.assertEqual(remains, "hämta mig om och") td, remains = extract_duration("om en halvtimme", lang='sv-se') self.assertEqual(td, timedelta(minutes=30)) self.assertEqual(remains, "om") def test_invalid_extract_duration(self): """No duration in sentence.""" res = extract_duration("vad är en myrslok", lang='sv-se') self.assertEqual(res, None) res = extract_duration("svaret är 42", lang='sv-se') self.assertEqual(res, None) if __name__ == "__main__": unittest.main()