pax_global_header00006660000000000000000000000064136627122460014523gustar00rootroot0000000000000052 comment=eba781b6212869f0098a1b53e22db7cd33d6002c padatious-0.4.8/000077500000000000000000000000001366271224600135255ustar00rootroot00000000000000padatious-0.4.8/.gitignore000066400000000000000000000002201366271224600155070ustar00rootroot00000000000000build/ dist/ .idea/ *.egg-info intent_cache/ __pycache__/ *.pyc .pytest_cache/ *venv/ .cache/ .coverage* coverage.xml /.project /.pydevproject padatious-0.4.8/.pep8speaks.yml000077500000000000000000000000461366271224600164140ustar00rootroot00000000000000pycodestyle: max-line-length: 100 padatious-0.4.8/LICENSE000077500000000000000000000216631366271224600145450ustar00rootroot00000000000000Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: You must give any other recipients of the Work or Derivative Works a copy of this License; and You must cause any modified files to carry prominent notices stating that You changed the files; and You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. 
Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS padatious-0.4.8/MANIFEST.in000077500000000000000000000000311366271224600152600ustar00rootroot00000000000000include requirements.txt padatious-0.4.8/README.md000066400000000000000000000041651366271224600150120ustar00rootroot00000000000000[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE.md) [![CLA](https://img.shields.io/badge/CLA%3F-Required-blue.svg)](https://mycroft.ai/cla) [![Team](https://img.shields.io/badge/Team-Mycroft_Core-violetblue.svg)](https://github.com/MycroftAI/contributors/blob/master/team/Mycroft%20Core.md) ![Status](https://img.shields.io/badge/-Production_ready-green.svg) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](http://makeapullrequest.com) [![Join chat](https://img.shields.io/badge/Mattermost-join_chat-brightgreen.svg)](https://chat.mycroft.ai/community/channels/machine-learning) # Padatious An efficient and agile neural network intent parser. Padatious is a core component of [Mycroft AI](https://mycroft.ai). ## Features - Intents are easy to create - Requires a relatively small amount of data - Intents run independent of each other - Easily extract entities (ie. 
Find the nearest *gas station* -> `place: gas station`) - Fast training with a modular approach to neural networks ## Getting Started ### Installing Padatious requires the following native packages to be installed: - [`FANN`][fann] (with dev headers) - Python development headers - `pip3` - `swig` Ubuntu: ``` sudo apt-get install libfann-dev python3-dev python3-pip swig ``` Next, install Padatious via `pip3`: ``` pip3 install padatious ``` Padatious also works in Python 2 if you are unable to upgrade. [fann]:https://github.com/libfann/fann ### Example Here's a simple example of how to use Padatious: #### program.py ```Python from padatious import IntentContainer container = IntentContainer('intent_cache') container.add_intent('hello', ['Hi there!', 'Hello.']) container.add_intent('goodbye', ['See you!', 'Goodbye!']) container.add_intent('search', ['Search for {query} (using|on) {engine}.']) container.train() print(container.calc_intent('Hello there!')) print(container.calc_intent('Search for cats on CatTube.')) container.remove_intent('goodbye') ``` Run with: ```bash python3 program.py ``` ## Learn More Further documentation can be found at https://mycroft.ai/documentation/padatious/ padatious-0.4.8/demo.py000077500000000000000000000016721366271224600150340ustar00rootroot00000000000000#!/usr/bin/env python3 # Sample Padatious program used for testing import sys from builtins import input from glob import glob from os.path import basename from padatious import IntentContainer reload_cache = len(sys.argv) > 1 and sys.argv[1] == '-r' container = IntentContainer('intent_cache') for file_name in glob('data/*.intent'): name = basename(file_name).replace('.intent', '') container.load_file(name, file_name, reload_cache=reload_cache) for file_name in glob('data/*.entity'): name = basename(file_name).replace('.entity', '') container.load_entity(name, file_name, reload_cache=reload_cache) container.train() query = None while query != 'q': try: query = input('> ') except 
(KeyboardInterrupt, EOFError): print() break data = container.calc_intent(query) print(data.name + ': ' + str(data.conf)) for key, val in data.matches.items(): print('\t' + key + ': ' + val) padatious-0.4.8/docs/000077500000000000000000000000001366271224600144555ustar00rootroot00000000000000padatious-0.4.8/docs/.gitignore000066400000000000000000000000351366271224600164430ustar00rootroot00000000000000_build/ _static/ _templates/ padatious-0.4.8/docs/Makefile000066400000000000000000000011431366271224600161140ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python3 -msphinx SPHINXPROJ = Padatious SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) padatious-0.4.8/docs/conf.py000066400000000000000000000031011366271224600157470ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Padatious documentation build configuration file # import os import sys sys.path.insert(0, os.path.abspath('../')) # General Configuration extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', 'sphinx.ext.napoleon'] autodoc_mock_imports = ['fann2.libfann', 'xxhash'] templates_path = ['_templates'] source_suffix = '.rst' master_doc = 'index' # General Info project = 'Padatious' copyright = '2017, Mycroft AI' author = 'Matthew Scholefield' version = '0.1.0' release = '0.1.0' # Includes alpha/beta/rc tags. 
language = None exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # Syntax Highlighting pygments_style = 'sphinx' todo_include_todos = False import sphinx_rtd_theme html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] html_theme_options = { 'navigation_depth': 4, } html_static_path = [] htmlhelp_basename = 'Padatiousdoc' # Options for LaTeX output latex_elements = {} latex_documents = [ (master_doc, 'Padatious.tex', 'Padatious Documentation', 'Matthew Scholefield', 'manual'), ] # Options for manual page output man_pages = [ (master_doc, 'padatious', 'Padatious Documentation', [author], 1) ] # Options for Texinfo output texinfo_documents = [ (master_doc, 'Padatious', 'Padatious Documentation', author, 'Padatious', 'Neural Network Intent Parser.', 'Miscellaneous'), ] # Options for Napoleon napoleon_google_docstring = True napoleon_numpy_docstring = False padatious-0.4.8/docs/index.rst000066400000000000000000000005641366271224600163230ustar00rootroot00000000000000.. Padatious documentation master file Padatious ========= *An efficient and agile neural network intent parser* .. toctree:: :maxdepth: 4 :caption: Contents: **IntentContainer** =================== .. autoclass:: padatious.IntentContainer :members: :undoc-members: **MatchData** ============= .. autoclass:: padatious.MatchData :members: padatious-0.4.8/format.ebnf000066400000000000000000000004241366271224600156510ustar00rootroot00000000000000 ::= * | '(' ')' | | ::= | ::= ( | '' ) ( '|' ) ::= ( '|' ) ( | '' ) ::= '{' '}' ::= ('a'| ... |'$'|'('|')')padatious-0.4.8/padatious/000077500000000000000000000000001366271224600155165ustar00rootroot00000000000000padatious-0.4.8/padatious/__init__.py000066400000000000000000000013071366271224600176300ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .intent_container import IntentContainer from .match_data import MatchData __version__ = '0.4.7' # Also change in setup.py padatious-0.4.8/padatious/__main__.py000066400000000000000000000040571366271224600176160ustar00rootroot00000000000000import inspect import json from os.path import basename, splitext from argparse import ArgumentParser from padatious import IntentContainer def train_setup(parser): parser.add_argument('intent_cache', help='Folder to write trained intents to') parser.add_argument('input_files', nargs='*', help='Input .intent and .entity files') parser.add_argument('-d', '--data', help='Serialized training args', type=json.loads) parser.add_argument('-s', '--single-thread', help='Run training in a single thread') parser.add_argument('-f', '--force', help='Force retraining if already trained') parser.add_argument('-a', '--args', help='Extra args (list) for function', type=json.loads) parser.add_argument('-k', '--kwargs', help='Extra kwargs (json) for function', type=json.loads) def train(parser, args): if bool(args.input_files) == bool(args.data): parser.error('You must specify one of input_files or --data (but not both)') cont = IntentContainer(args.intent_cache) if args.data: cont.apply_training_args(args.data) else: for fn in args.input_files: obj_name, ext = splitext(basename(fn)) if ext == '.intent': cont.load_intent(obj_name, fn) elif ext == '.entity': cont.load_entity(obj_name, fn) else: parser.error('Unknown file extension: {}'.format(ext)) kwargs = inspect.signature(cont.train).bind(*(args.args or [])).arguments 
kwargs.update(args.kwargs or {}) kwargs.setdefault('debug', True) kwargs.setdefault('single_thread', args.single_thread) kwargs.setdefault('force', args.force) if cont.train(**kwargs): return 0 return 10 # timeout def main(): parser = ArgumentParser(description='Tool to interact with padatious via command line') p = parser.add_subparsers(dest='action') p.required = True train_setup(p.add_parser('train')) args = parser.parse_args() if args.action == 'train': exit(train(parser, args)) if __name__ == '__main__': main() padatious-0.4.8/padatious/bracket_expansion.py000066400000000000000000000132611366271224600215720ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. class Fragment(object): """(Abstract) empty sentence fragment""" def __init__(self, tree): """ Construct a sentence tree fragment which is merely a wrapper for a list of Strings Args: tree (?): Base tree for the sentence fragment, type depends on subclass, refer to those subclasses """ self._tree = tree def tree(self): """Return the represented sentence tree as raw data.""" return self._tree def expand(self): """ Expanded version of the fragment. In this case an empty sentence. Returns: List>: A list with an empty sentence (= token/string list) """ return [[]] def __str__(self): return self._tree.__str__() def __repr__(self): return self._tree.__repr__() class Word(Fragment): """ Single word in the sentence tree. Construct with a string as argument. 
""" def expand(self): """ Creates one sentence that contains exactly that word. Returns: List>: A list with the given string as sentence (= token/string list) """ return [[self._tree]] class Sentence(Fragment): """ A Sentence made of several concatenations/words. Construct with a List as argument. """ def expand(self): """ Creates a combination of all sub-sentences. Returns: List>: A list with all subsentence expansions combined in every possible way """ old_expanded = [[]] for sub in self._tree: sub_expanded = sub.expand() new_expanded = [] while len(old_expanded) > 0: sentence = old_expanded.pop() for new in sub_expanded: new_expanded.append(sentence + new) old_expanded = new_expanded return old_expanded class Options(Fragment): """ A Combination of possible sub-sentences. Construct with List as argument. """ def expand(self): """ Returns all of its options as seperated sub-sentences. Returns: List>: A list containing the sentences created by all expansions of its sub-sentences """ options = [] for option in self._tree: options.extend(option.expand()) return options class SentenceTreeParser(object): """ Generate sentence token trees from a list of tokens ['1', '(', '2', '|', '3, ')'] -> [['1', '2'], ['1', '3']] """ def __init__(self, tokens): self.tokens = tokens def _parse(self): """ Generate sentence token trees ['1', '(', '2', '|', '3, ')'] -> ['1', ['2', '3']] """ self._current_position = 0 return self._parse_expr() def _parse_expr(self): """ Generate sentence token trees from the current position to the next closing parentheses / end of the list and return it ['1', '(', '2', '|', '3, ')'] -> ['1', [['2'], ['3']]] ['2', '|', '3'] -> [['2'], ['3']] """ # List of all generated sentences sentence_list = [] # Currently active sentence cur_sentence = [] sentence_list.append(Sentence(cur_sentence)) # Determine which form the current expression has while self._current_position < len(self.tokens): cur = self.tokens[self._current_position] self._current_position += 1 
if cur == '(': # Parse the subexpression subexpr = self._parse_expr() # Check if the subexpression only has one branch # -> If so, append "(" and ")" and add it as is normal_brackets = False if len(subexpr.tree()) == 1: normal_brackets = True cur_sentence.append(Word('(')) # add it to the sentence cur_sentence.append(subexpr) if normal_brackets: cur_sentence.append(Word(')')) elif cur == '|': # Begin parsing a new sentence cur_sentence = [] sentence_list.append(Sentence(cur_sentence)) elif cur == ')': # End parsing the current subexpression break # TODO anything special about {sth}? else: cur_sentence.append(Word(cur)) return Options(sentence_list) def _expand_tree(self, tree): """ Expand a list of sub sentences to all combinated sentences. ['1', ['2', '3']] -> [['1', '2'], ['1', '3']] """ return tree.expand() def expand_parentheses(self): tree = self._parse() return self._expand_tree(tree) padatious-0.4.8/padatious/entity.py000066400000000000000000000033221366271224600174040ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from os.path import join from padatious.simple_intent import SimpleIntent from padatious.trainable import Trainable class Entity(SimpleIntent, Trainable): def __init__(self, name, *args, **kwargs): SimpleIntent.__init__(self, name) Trainable.__init__(self, name, *args, **kwargs) @staticmethod def verify_name(token): if token[0] in '{}' or token[-1] in '{}': raise ValueError('token must not be surrounded in braces (ie. {word} should be word)') @staticmethod def wrap_name(name): """Wraps SkillName:entity into SkillName:{entity}""" if ':' in name: parts = name.split(':') intent_name, ent_name = parts[0], parts[1:] return intent_name + ':{' + ':'.join(ent_name) + '}' else: return '{' + name + '}' def save(self, folder): prefix = join(folder, self.name) SimpleIntent.save(self, prefix) self.save_hash(prefix) @classmethod def from_file(cls, name, folder): self = super(Entity, cls).from_file(name, join(folder, name)) self.load_hash(join(folder, name)) return self padatious-0.4.8/padatious/entity_edge.py000066400000000000000000000107131366271224600203720ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from fann2 import libfann as fann from padatious.id_manager import IdManager from padatious.util import StrEnum, resolve_conflicts class Ids(StrEnum): end = ':end' class EntityEdge(object): """ Represents the left or right side of an entity (a PosIntent) Args: direction (int): -1 for left and +1 for right token (str): token to attach to (something like {word}) intent_name (str): name of parent intent """ def __init__(self, direction, token, intent_name): self.ids = IdManager(Ids) self.intent_name = intent_name self.token = token self.dir = direction self.net = None def get_end(self, sent): return len(sent) if self.dir > 0 else -1 def vectorize(self, sent, pos): unknown = 0 vector = self.ids.vector() end_pos = self.get_end(sent) for i in range(pos + self.dir, end_pos, self.dir): if sent[i] in self.ids: self.ids.assign(vector, sent[i], 1.0 / abs(i - pos)) else: unknown += 1 self.ids.assign(vector, Ids.end, 1.0 / abs(end_pos - pos)) return vector def match(self, sent, pos): return self.net.run(self.vectorize(sent, pos))[0] def configure_net(self): layers = [len(self.ids), 3, 1] self.net = fann.neural_net() self.net.create_standard_array(layers) self.net.set_activation_function_hidden(fann.SIGMOID_SYMMETRIC_STEPWISE) self.net.set_activation_function_output(fann.SIGMOID_STEPWISE) self.net.set_train_stop_function(fann.STOPFUNC_BIT) self.net.set_bit_fail_limit(0.1) def save(self, prefix): prefix += '.' + {-1: 'l', +1: 'r'}[self.dir] self.net.save(str(prefix + '.net')) # Must have str() self.ids.save(prefix) def load(self, prefix): prefix += '.' 
+ {-1: 'l', +1: 'r'}[self.dir] self.net = fann.neural_net() if not self.net.create_from_file(str(prefix + '.net')): # Must have str() raise FileNotFoundError(str(prefix + '.net')) self.ids.load(prefix) def train(self, train_data): for sent in train_data.my_sents(self.intent_name): if self.token in sent: for i in range(sent.index(self.token) + self.dir, self.get_end(sent), self.dir): if sent[i][0] != '{': self.ids.add_token(sent[i]) inputs, outputs = [], [] def pollute(sent, i, out_val): """Simulates multiple token words in adjacent entities""" for j, check_token in enumerate(sent): d = j - i if int(d > 0) - int(d < 0) == self.dir and check_token.startswith('{'): for pol_len in range(1, 4): s = sent[:j] + [':0'] * pol_len + sent[j + 1:] p = i + (pol_len - 1) * int(self.dir < 0) inputs.append(self.vectorize(s, p)) outputs.append([out_val]) def add_sents(sents, out_fn): for sent in sents: for i, token in enumerate(sent): out_val = out_fn(token) inputs.append(self.vectorize(sent, i)) outputs.append([out_val]) if out_val == 1.0: pollute(sent, i, 1.0) add_sents(train_data.my_sents(self.intent_name), lambda x: float(x == self.token)) add_sents(train_data.other_sents(self.intent_name), lambda x: 0.0) inputs, outputs = resolve_conflicts(inputs, outputs) data = fann.training_data() data.set_train_data(inputs, outputs) for _ in range(10): self.configure_net() self.net.train_on_data(data, 1000, 0, 0) self.net.test_data(data) if self.net.get_bit_fail() == 0: break padatious-0.4.8/padatious/entity_manager.py000066400000000000000000000025471366271224600211060ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from padatious.entity import Entity from padatious.training_manager import TrainingManager class EntityManager(TrainingManager): def __init__(self, cache): super(EntityManager, self).__init__(Entity, cache) self.entity_dict = {} def calc_ent_dict(self): for i in self.objects: self.entity_dict[i.name] = i def find(self, intent_name, token): local_name, global_name = '', token if ':' in intent_name: local_name = intent_name.split(':')[0] + ':' + token return self.entity_dict.get(local_name, self.entity_dict.get(global_name)) def remove(self, name): name = '{' + name + '}' if name in self.entity_dict: del self.entity_dict[name] super(EntityManager, self).remove(name) padatious-0.4.8/padatious/id_manager.py000066400000000000000000000036431366271224600201640ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json from padatious.util import StrEnum class IdManager(object): """ Gives manages specific unique identifiers for tokens. 
Used to convert tokens to vectors """ def __init__(self, id_cls=StrEnum, ids=None): if ids is not None: self.ids = ids else: self.ids = {} for i in id_cls.values(): self.add_token(i) def __len__(self): return len(self.ids) @staticmethod def adj_token(token): if token.isdigit(): for i in range(10): if str(i) in token: token = token.replace(str(i), '#') return token def vector(self): return [0.0] * len(self.ids) def save(self, prefix): with open(prefix + '.ids', 'w') as f: json.dump(self.ids, f) def load(self, prefix): with open(prefix + '.ids', 'r') as f: self.ids = json.load(f) def assign(self, vector, key, val): vector[self.ids[self.adj_token(key)]] = val def __contains__(self, token): return self.adj_token(token) in self.ids def add_token(self, token): token = self.adj_token(token) if token not in self.ids: self.ids[token] = len(self.ids) def add_sent(self, sent): for token in sent: self.add_token(token) padatious-0.4.8/padatious/intent.py000066400000000000000000000055341366271224600174000ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import json
import math
from os.path import join

from padatious.match_data import MatchData
from padatious.pos_intent import PosIntent
from padatious.simple_intent import SimpleIntent
from padatious.trainable import Trainable


class Intent(Trainable):
    """Full intent object to handle entity extraction and intent matching"""

    def __init__(self, *args, **kwargs):
        super(Intent, self).__init__(*args, **kwargs)
        # Whole-sentence matcher plus one positional matcher per {entity} slot
        self.simple_intent = SimpleIntent(self.name)
        self.pos_intents = []

    def match(self, sent, entities=None):
        """Match a tokenized sentence against this intent.

        Args:
            sent (list of str): Tokenized query
            entities: EntityManager used to score extracted slots, or None
        Returns:
            MatchData: Best candidate match (always returns one; the
                unmodified sentence with conf 0.0 is always a candidate)
        """
        possible_matches = [MatchData(self.name, sent)]
        # Each positional intent expands every existing candidate with
        # variants that have one more entity extracted
        for pi in self.pos_intents:
            entity = entities.find(self.name, pi.token) if entities else None
            for i in list(possible_matches):
                possible_matches += pi.match(i, entity)

        possible_matches = [i for i in possible_matches if i.conf >= 0.0]

        for i in possible_matches:
            # Average the per-entity confidence, then combine (geometric
            # mean) with the whole-sentence network confidence
            conf = ((i.conf / len(i.matches)) if len(i.matches) > 0 else 0) + 0.5
            i.conf = math.sqrt(conf * self.simple_intent.match(i.sent))

        return max(possible_matches, key=lambda x: x.conf)

    def save(self, folder):
        """Persist hash, sentence network, and positional nets under folder."""
        prefix = join(folder, self.name)
        with open(prefix + '.hash', 'wb') as f:
            f.write(self.hash)
        self.simple_intent.save(prefix)
        prefix += '.pos'
        with open(prefix, 'w') as f:
            json.dump([i.token for i in self.pos_intents], f)
        for pos_intent in self.pos_intents:
            pos_intent.save(prefix)

    @classmethod
    def from_file(cls, name, folder):
        """Load a previously-saved intent (see save()) from folder."""
        self = cls(name)
        prefix = join(folder, name)
        self.load_hash(prefix)
        self.simple_intent = SimpleIntent.from_file(name, prefix)
        prefix += '.pos'
        with open(prefix, 'r') as f:
            tokens = json.load(f)
        for token in tokens:
            self.pos_intents.append(PosIntent.from_file(prefix, token))
        return self

    def train(self, train_data):
        """Train the sentence net and one positional net per {token} found."""
        tokens = set([token for sent in train_data.my_sents(self.name)
                      for token in sent if token.startswith('{')])
        self.pos_intents = [PosIntent(i, self.name) for i in tokens]

        self.simple_intent.train(train_data)
        for i in self.pos_intents:
            i.train(train_data)
import inspect
import json
import os
import padaos
import sys
from functools import wraps
from subprocess import call, check_output
from threading import Thread

from padatious.match_data import MatchData
from padatious.entity import Entity
from padatious.entity_manager import EntityManager
from padatious.intent_manager import IntentManager
from padatious.util import tokenize


def _save_args(func):
    """Decorator that records each call's (defaulted) keyword arguments into
    self.serialized_args so training can be replayed in a subprocess."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        func(*args, **kwargs)
        bound_args = inspect.signature(func).bind(*args, **kwargs)
        bound_args.apply_defaults()
        kwargs = bound_args.arguments
        kwargs['__name__'] = func.__name__
        # 'self' is removed from the recorded args; the record is kept on it
        kwargs.pop('self').serialized_args.append(kwargs)
    return wrapper


class IntentContainer(object):
    """
    Creates an IntentContainer object used to load and match intents

    Args:
        cache_dir (str): Place to put all saved neural networks
    """
    def __init__(self, cache_dir):
        os.makedirs(cache_dir, exist_ok=True)
        self.cache_dir = cache_dir
        self.must_train = False
        self.intents = IntentManager(cache_dir)
        self.entities = EntityManager(cache_dir)
        # padaos handles exact (regex) matches alongside the neural nets
        self.padaos = padaos.IntentContainer()
        self.train_thread = None  # type: Thread
        self.serialized_args = []  # Arguments of all calls to register intents/entities

    def clear(self):
        """Reset the container to a freshly-constructed state."""
        os.makedirs(self.cache_dir, exist_ok=True)
        self.must_train = False
        self.intents = IntentManager(self.cache_dir)
        self.entities = EntityManager(self.cache_dir)
        self.padaos = padaos.IntentContainer()
        self.train_thread = None
        self.serialized_args = []

    def instantiate_from_disk(self):
        """
        Instantiates the necessary (internal) data structures when loading
        persisted model from disk. This is done via injecting entities and
        intents back from cached file versions.
        """
        entity_traindata = {}
        intent_traindata = {}

        # workaround: load training data for both entities and intents since
        # padaos regex needs it for (re)compilation until TODO is cleared
        for f in os.listdir(self.cache_dir):
            if f.endswith('.entity'):
                entity_name = f[0:f.find('.entity')]
                with open(os.path.join(self.cache_dir, f), 'r') as d:
                    entity_traindata[entity_name] = [line.strip()
                                                     for line in d]
            elif f.endswith('.intent'):
                intent_name = f[0:f.find('.intent')]
                with open(os.path.join(self.cache_dir, f), 'r') as d:
                    intent_traindata[intent_name] = [line.strip()
                                                     for line in d]

        # TODO: padaos.compile (regex compilation) is redone when loading: find
        # a way to persist regex, as well!
        # Entity caches are brace-wrapped ("{name}.hash"); intents are not.
        for f in os.listdir(self.cache_dir):
            if f.startswith('{') and f.endswith('}.hash'):
                entity_name = f[1:f.find('}.hash')]
                self.add_entity(
                    name=entity_name,
                    lines=entity_traindata[entity_name],
                    reload_cache=False,
                    must_train=False)
            elif not f.startswith('{') and f.endswith('.hash'):
                intent_name = f[0:f.find('.hash')]
                self.add_intent(
                    name=intent_name,
                    lines=intent_traindata[intent_name],
                    reload_cache=False,
                    must_train=False)

    @_save_args
    def add_intent(self, name, lines, reload_cache=False, must_train=True):
        """
        Creates a new intent, optionally checking the cache first

        Args:
            name (str): The associated name of the intent
            lines (list): All the sentences that should activate the intent
            reload_cache: Whether to ignore cached intent if exists
        """
        self.intents.add(name, lines, reload_cache, must_train)
        self.padaos.add_intent(name, lines)
        self.must_train = must_train

    @_save_args
    def add_entity(self, name, lines, reload_cache=False, must_train=True):
        """
        Adds an entity that matches the given lines.

        Example:
            self.add_intent('weather', ['will it rain on {weekday}?'])
            self.add_entity('weekday', ['monday', 'tuesday', 'wednesday'])

        Args:
            name (str): The name of the entity
            lines (list): Lines of example extracted entities
            reload_cache (bool): Whether to refresh all of cache
        """
        Entity.verify_name(name)
        self.entities.add(
            Entity.wrap_name(name),
            lines,
            reload_cache,
            must_train)
        self.padaos.add_entity(name, lines)
        self.must_train = must_train

    @_save_args
    def load_entity(
            self,
            name,
            file_name,
            reload_cache=False,
            must_train=True):
        """
        Loads an entity, optionally checking the cache first

        Args:
            name (str): The associated name of the entity
            file_name (str): The location of the entity file
            reload_cache (bool): Whether to refresh all of cache
        """
        Entity.verify_name(name)
        self.entities.load(Entity.wrap_name(name), file_name, reload_cache)
        with open(file_name) as f:
            self.padaos.add_entity(name, f.read().split('\n'))
        self.must_train = must_train

    @_save_args
    def load_file(self, *args, **kwargs):
        """Legacy. Use load_intent instead"""
        self.load_intent(*args, **kwargs)

    @_save_args
    def load_intent(
            self,
            name,
            file_name,
            reload_cache=False,
            must_train=True):
        """
        Loads an intent, optionally checking the cache first

        Args:
            name (str): The associated name of the intent
            file_name (str): The location of the intent file
            reload_cache (bool): Whether to refresh all of cache
        """
        self.intents.load(name, file_name, reload_cache)
        with open(file_name) as f:
            self.padaos.add_intent(name, f.read().split('\n'))
        self.must_train = must_train

    @_save_args
    def remove_intent(self, name):
        """Unload an intent"""
        self.intents.remove(name)
        self.padaos.remove_intent(name)
        self.must_train = True

    @_save_args
    def remove_entity(self, name):
        """Unload an entity"""
        self.entities.remove(name)
        self.padaos.remove_entity(name)

    def _train(self, *args, **kwargs):
        # Train intents and entities concurrently, then rebuild entity lookup
        t1 = Thread(
            target=self.intents.train,
            args=args,
            kwargs=kwargs,
            daemon=True)
        t2 = Thread(
            target=self.entities.train,
            args=args,
            kwargs=kwargs,
            daemon=True)
        t1.start()
        t2.start()
        t1.join()
        t2.join()
        self.entities.calc_ent_dict()

    def train(self, debug=True, force=False, single_thread=False, timeout=20):
        """
        Trains all the loaded intents that need to be updated
        If a cache file exists with the same hash as the intent file,
        the intent will not be trained and just loaded from file

        Args:
            debug (bool): Whether to print a message to stdout each time a new
                intent is trained
            force (bool): Whether to force training if already finished
            single_thread (bool): Whether to force running in a single thread
            timeout (float): Seconds before cancelling training
        Returns:
            bool: True if training succeeded without timeout
        """
        if not self.must_train and not force:
            return
        self.padaos.compile()
        self.train_thread = Thread(target=self._train, kwargs=dict(
            debug=debug,
            single_thread=single_thread,
            timeout=timeout
        ), daemon=True)
        self.train_thread.start()
        self.train_thread.join(timeout)

        self.must_train = False
        # A still-alive thread means training exceeded the timeout
        return not self.train_thread.is_alive()

    def train_subprocess(self, *args, **kwargs):
        """
        Trains in a subprocess which provides a timeout guarantees
        everything shuts down properly

        Args:
            See train()
        Returns:
            bool: True for success, False if timed out
        """
        ret = call([
            sys.executable, '-m', 'padatious', 'train', self.cache_dir,
            '-d', json.dumps(self.serialized_args),
            '-a', json.dumps(args),
            '-k', json.dumps(kwargs),
        ])
        if ret == 2:
            raise TypeError(
                'Invalid train arguments: {} {}'.format(
                    args, kwargs))
        # Rebuild this container's state by replaying the recorded calls
        data = self.serialized_args
        self.clear()
        self.apply_training_args(data)
        self.padaos.compile()
        if ret == 0:
            self.must_train = False
            return True
        elif ret == 10:  # timeout
            return False
        else:
            raise ValueError(
                'Training failed and returned code: {}'.format(ret))

    def calc_intents(self, query):
        """
        Tests all the intents against the query and returns
        data on how well each one matched against the query

        Args:
            query (str): Input sentence to test against intents
        Returns:
            list: List of intent matches
        See calc_intent() for a description of the returned MatchData
        """
        if self.must_train:
            self.train()
        # If training is still running, only exact padaos matches are usable
        intents = {} if self.train_thread and self.train_thread.is_alive() else {
            i.name: i
            for i in self.intents.calc_intents(query, self.entities)
        }
        sent = tokenize(query)
        # Exact regex matches override neural results with conf 1.0
        for perfect_match in self.padaos.calc_intents(query):
            name = perfect_match['name']
            intents[name] = MatchData(
                name, sent, matches=perfect_match['entities'], conf=1.0)
        return list(intents.values())

    def calc_intent(self, query):
        """
        Tests all the intents against the query and returns
        match data of the best intent

        Args:
            query (str): Input sentence to test against intents
        Returns:
            MatchData: Best intent match
        """
        matches = self.calc_intents(query)
        if len(matches) == 0:
            return MatchData('', '')
        best_match = max(matches, key=lambda x: x.conf)
        best_matches = (
            match for match in matches if match.conf == best_match.conf)
        # Tie-break equal confidences by the least total extracted text
        return min(best_matches, key=lambda x: sum(
            map(len, x.matches.values())))

    def get_training_args(self):
        """Return the recorded register-call arguments (see _save_args)."""
        return self.serialized_args

    def apply_training_args(self, data):
        """Replay recorded register calls (from get_training_args) on self."""
        for params in data:
            func_name = params.pop('__name__')
            getattr(self, func_name)(**params)
class IntentManager(TrainingManager):
    """Trains Intent objects and evaluates them against queries."""

    def __init__(self, cache):
        super(IntentManager, self).__init__(Intent, cache)

    def calc_intents(self, query, entity_manager):
        """Match the query against every trained intent.

        Args:
            query (str): Raw input sentence
            entity_manager: EntityManager used to score entity slots
        Returns:
            list of MatchData: One (detokenized) result per intent
        """
        tokens = tokenize(query)
        results = []
        for intent in self.objects:
            result = intent.match(tokens, entity_manager)
            # Convert token lists back to plain strings before returning
            result.detokenize()
            results.append(result)
        return results
class MatchData(object):
    """
    A set of data describing how a query fits into an intent

    Attributes:
        name (str): Name of matched intent
        sent (str): The query after entity extraction
        conf (float): Confidence (from 0.0 to 1.0)
        matches (dict of str -> str): Key is the name of the entity and
            value is the extracted part of the sentence
    """

    def __init__(self, name, sent, matches=None, conf=0.0):
        self.name = name
        self.sent = sent
        self.matches = matches or {}
        self.conf = conf

    def __getitem__(self, item):
        return self.matches[item]

    def __contains__(self, item):
        return item in self.matches

    def get(self, key, default=None):
        return self.matches.get(key, default)

    def __repr__(self):
        return repr(self.__dict__)

    @staticmethod
    def handle_apostrophes(old_sentence):
        """Join a token list into a string, gluing apostrophe tokens back
        onto their neighbors (e.g. ['it', "'", 's'] -> "it's")."""
        pieces = []
        glue_next = False
        for word in old_sentence:
            if word == "'":
                # Attach the apostrophe directly to the previous word
                glue_next = True
                pieces.append(word)
            elif glue_next:
                glue_next = False
                # A multi-character word after an apostrophe means the
                # apostrophe ended the previous word (e.g. "dogs' toys"),
                # so a space is still needed; a single character (like "s")
                # is a contraction and gets glued on
                pieces.append(' ' + word if len(word) > 1 else word)
            else:
                pieces.append(' ' + word if pieces else word)
        return ''.join(pieces)

    def detokenize(self):
        """Convert sent and every matches value from token lists into
        combined strings, stripping braces from entity names."""
        self.sent = self.handle_apostrophes(self.sent)
        cleaned = {}
        for token, extracted in self.matches.items():
            plain_name = token.replace('{', '').replace('}', '')
            cleaned[plain_name] = self.handle_apostrophes(extracted)
        self.matches = cleaned
import math

from padatious.entity_edge import EntityEdge
from padatious.match_data import MatchData


class PosIntent(object):
    """
    Positional intent
    Used to extract entities

    Args:
        token (str): token to attach to (something like {word})
    """

    def __init__(self, token, intent_name=''):
        self.token = token
        # One edge matcher for the word boundary before the entity (-1)
        # and one for the boundary after it (+1)
        self.edges = [EntityEdge(-1, token, intent_name),
                      EntityEdge(+1, token, intent_name)]

    def match(self, orig_data, entity=None):
        """Generate candidate matches with this entity extracted.

        Args:
            orig_data (MatchData): Candidate to expand
            entity: Trained entity used to score the extracted span, or None
        Returns:
            list of MatchData: One candidate per plausible (left, right) span
        """
        # Score every position as a potential left/right entity boundary
        l_matches = [(self.edges[0].match(orig_data.sent, pos), pos)
                     for pos in range(len(orig_data.sent))]
        r_matches = [(self.edges[1].match(orig_data.sent, pos), pos)
                     for pos in range(len(orig_data.sent))]

        def is_valid(l_pos, r_pos):
            # A span is invalid if reversed or if it overlaps an
            # already-extracted entity placeholder ({...} token)
            if r_pos < l_pos:
                return False
            for p in range(l_pos, r_pos + 1):
                if orig_data.sent[p].startswith('{'):
                    return False
            return True

        possible_matches = []
        for l_conf, l_pos in l_matches:
            if l_conf < 0.2:  # prune weak left boundaries
                continue
            for r_conf, r_pos in r_matches:
                if r_conf < 0.2:  # prune weak right boundaries
                    continue
                if not is_valid(l_pos, r_pos):
                    continue

                extracted = orig_data.sent[l_pos:r_pos + 1]
                # Average the two boundary confidences (each centered at 0.5)
                pos_conf = (l_conf - 0.5 + r_conf - 0.5) / 2 + 0.5
                ent_conf = (entity.match(extracted)
                            if entity else 1)

                # Replace the extracted span with the entity placeholder
                new_sent = orig_data.sent[:l_pos] + [self.token] + orig_data.sent[r_pos + 1:]

                new_matches = orig_data.matches.copy()
                new_matches[self.token] = extracted
                # Geometric mean of position and entity confidence,
                # re-centered so 0.5 contributes nothing
                extra_conf = math.sqrt(pos_conf * ent_conf) - 0.5
                data = MatchData(orig_data.name, new_sent, new_matches,
                                 orig_data.conf + extra_conf)
                possible_matches.append(data)
        return possible_matches

    def save(self, prefix):
        """Persist both edge networks under <prefix>.<token>."""
        prefix += '.' + self.token
        for i in self.edges:
            i.save(prefix)

    @classmethod
    def from_file(cls, prefix, token):
        """Load a previously-saved positional intent (see save())."""
        prefix += '.' + token
        self = cls(token)
        for i in self.edges:
            i.load(prefix)
        return self

    def train(self, train_data):
        for i in self.edges:
            i.train(train_data)
    def configure_net(self):
        """(Re)create the FANN network: input layer sized to the id table,
        one hidden layer of 10 units, and a single confidence output."""
        self.net = fann.neural_net()
        self.net.create_standard_array([len(self.ids), 10, 1])
        self.net.set_activation_function_hidden(fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_activation_function_output(fann.SIGMOID_SYMMETRIC_STEPWISE)
        self.net.set_train_stop_function(fann.STOPFUNC_BIT)
        self.net.set_bit_fail_limit(0.1)

    def train(self, train_data):
        """Build positive/negative training vectors and train the net.

        Args:
            train_data (TrainData): Source of this intent's sentences
                (positives) and every other intent's sentences (negatives)
        """
        # Register all tokens of this intent's sentences in the id table
        for sent in train_data.my_sents(self.name):
            self.ids.add_sent(sent)

        inputs = []
        outputs = []

        def add(vec, out):
            # Record one training sample: vectorized sentence -> confidence
            inputs.append(self.vectorize(vec))
            outputs.append([out])

        def pollute(sent, p):
            # Insert ':null:' filler tokens at position p and train the
            # result at reduced (LENIENCE) confidence
            sent = sent[:]
            for _ in range(int((len(sent) + 2) / 3)):
                sent.insert(p, ':null:')
            add(sent, self.LENIENCE)

        def weight(sent):
            # Train each word individually, weighted by length^3 so longer
            # (more distinctive) words carry more confidence; {entity}
            # placeholders get zero weight
            def calc_weight(w): return pow(len(w), 3.0)
            total_weight = 0.0
            for word in sent:
                total_weight += calc_weight(word)
            for word in sent:
                weight = 0 if word.startswith('{') else calc_weight(word)
                add([word], weight / total_weight)

        for sent in train_data.my_sents(self.name):
            add(sent, 1.0)
            weight(sent)

            # Generate samples with extra unknown tokens unless
            # the sentence is supposed to allow unknown tokens via the special :0
            if not any(word[0] == ':' and word != ':' for word in sent):
                pollute(sent, 0)
                pollute(sent, len(sent))

        # Other intents' sentences are explicit negatives
        for sent in train_data.other_sents(self.name):
            add(sent, 0.0)
        add([':null:'], 0.0)
        add([], 0.0)

        # A sentence with its entities blanked out should NOT match
        for sent in train_data.my_sents(self.name):
            without_entities = sent[:]
            for i, token in enumerate(without_entities):
                if token.startswith('{'):
                    without_entities[i] = ':null:'
            if without_entities != sent:
                add(without_entities, 0.0)

        inputs, outputs = resolve_conflicts(inputs, outputs)

        train_data = fann.training_data()
        train_data.set_train_data(inputs, outputs)

        # Retry training from scratch up to 10 times until every sample
        # is within the bit-fail limit
        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(train_data, 1000, 0, 0)
            self.net.test_data(train_data)
            if self.net.get_bit_fail() == 0:
                break

    def save(self, prefix):
        """Persist the network and id table under <prefix>.intent."""
        prefix += '.intent'
        self.net.save(str(prefix + '.net'))  # Must have str()
        self.ids.save(prefix)
class TrainData(object):
    """
    Training data used to access collections of tokenized sentences in
    intent files

    Attributes:
        sent_lists (dict of str -> list): intent/entity name mapped to its
            list of tokenized example sentences
    """

    def __init__(self):
        self.sent_lists = {}

    def add_lines(self, name, lines):
        """Tokenize, expand, and store example lines under name.

        Args:
            name (str): Name of the intent/entity the lines belong to
            lines (list of str): Raw example lines; comment lines and
                blank lines are dropped, (a|b) alternatives are expanded
        """
        lines = remove_comments(lines)
        # Flatten in a single pass; the previous sum(list_of_lists, [])
        # idiom rebuilt the accumulator per line (quadratic in line count)
        expanded = [sent
                    for line in lines
                    for sent in expand_parentheses(tokenize(line))]
        # Drop empty sentences (e.g. produced by blank lines)
        self.sent_lists[name] = [sent for sent in expanded if sent]

    def remove_lines(self, name):
        """Forget all sentences registered under name (no-op if absent)."""
        if name in self.sent_lists:
            del self.sent_lists[name]

    def add_file(self, name, file_name):
        """Load example lines for name from a text file."""
        with open(file_name, 'r') as f:
            self.add_lines(name, f.readlines())

    def all_sents(self):
        """Yield every tokenized sentence from every intent/entity."""
        for _, sents in self.sent_lists.items():
            for i in sents:
                yield i

    def my_sents(self, my_name):
        """Yield the tokenized sentences registered under my_name.

        Raises:
            KeyError: If my_name was never registered
        """
        for i in self.sent_lists[my_name]:
            yield i

    def other_sents(self, my_name):
        """Yield sentences of every registered name except my_name."""
        for name, sents in self.sent_lists.items():
            if name != my_name:
                for i in sents:
                    yield i
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from abc import ABCMeta, abstractmethod class Trainable(object): __metaclass__ = ABCMeta def __init__(self, name, hsh=b''): self.name = name self.hash = hsh def load_hash(self, prefix): with open(prefix + '.hash', 'rb') as f: self.hash = f.read() def save_hash(self, prefix): with open(prefix + '.hash', 'wb') as f: f.write(self.hash) @abstractmethod def train(self, data): pass @abstractmethod def save(self, prefix): pass @classmethod @abstractmethod def from_file(self, name, folder): pass padatious-0.4.8/padatious/training_manager.py000066400000000000000000000101361366271224600213760ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import multiprocessing as mp
from functools import partial
from multiprocessing.context import TimeoutError
from os.path import join, isfile, isdir, splitext

import padatious
from padatious.train_data import TrainData
from padatious.util import lines_hash


def _train_and_save(obj, cache, data, print_updates):
    """Internal pickleable function used to train objects in another process"""
    obj.train(data)
    if print_updates:
        print('Regenerated ' + obj.name + '.')
    obj.save(cache)


class TrainingManager(object):
    """
    Manages multithreaded training of either Intents or Entities

    Args:
        cls (Type[Trainable]): Class to wrap
        cache_dir (str): Place to store cache files
    """

    def __init__(self, cls, cache_dir):
        self.cls = cls
        self.cache = cache_dir
        self.objects = []          # trained (or cache-loaded) objects
        self.objects_to_train = []  # queued for (re)training

        self.train_data = TrainData()

    def add(self, name, lines, reload_cache=False, must_train=True):
        """Register an object, deciding between cache load and retraining.

        Args:
            name (str): Name of the intent/entity
            lines (list of str): Its raw training lines
            reload_cache (bool): Force retraining even if the cache matches
            must_train (bool): False means trust the cache unconditionally
        """
        # special case: load persisted (aka. cached) resource (i.e.
        # entity or intent) from file into memory data structures
        if not must_train:
            self.objects.append(
                self.cls.from_file(
                    name=name, folder=self.cache))
        # general case: load resource (entity or intent) to training queue
        # or if no change occurred to memory data structures
        else:
            hash_fn = join(self.cache, name + '.hash')
            old_hsh = None
            if isfile(hash_fn):
                with open(hash_fn, 'rb') as g:
                    old_hsh = g.read()
            # Hash includes the library major.minor version so caches are
            # invalidated across incompatible releases
            min_ver = splitext(padatious.__version__)[0]
            new_hsh = lines_hash([min_ver] + lines)
            if reload_cache or old_hsh != new_hsh:
                self.objects_to_train.append(self.cls(name=name, hsh=new_hsh))
            else:
                self.objects.append(
                    self.cls.from_file(
                        name=name, folder=self.cache))
            self.train_data.add_lines(name, lines)

    def load(self, name, file_name, reload_cache=False):
        """Register an object whose training lines live in a file."""
        with open(file_name) as f:
            self.add(name, f.read().split('\n'), reload_cache)

    def remove(self, name):
        """Forget an object and its training data entirely."""
        self.objects = [i for i in self.objects if i.name != name]
        self.objects_to_train = [
            i for i in self.objects_to_train if i.name != name]
        self.train_data.remove_lines(name)

    def train(self, debug=True, single_thread=False, timeout=20):
        """Train everything queued, then load the results from disk.

        Args:
            debug (bool): Print progress/timeout messages to stdout
            single_thread (bool): Train in-process instead of a worker pool
            timeout (float): Seconds to wait for the pool before giving up
        """
        train = partial(
            _train_and_save,
            cache=self.cache,
            data=self.train_data,
            print_updates=debug)

        if single_thread:
            for i in self.objects_to_train:
                train(i)
        else:
            # Train in multiple processes to disk
            pool = mp.Pool()
            try:
                pool.map_async(train, self.objects_to_train).get(timeout)
            except TimeoutError:
                if debug:
                    print('Some objects timed out while training')
            finally:
                pool.close()
                pool.join()

        # Load saved objects from disk; objects whose worker timed out
        # never wrote their files, hence the IOError handling
        for obj in self.objects_to_train:
            try:
                self.objects.append(
                    self.cls.from_file(
                        name=obj.name, folder=self.cache))
            except IOError:
                if debug:
                    print('Took too long to train', obj.name)
        self.objects_to_train = []
def resolve_conflicts(inputs, outputs):
    """
    Checks for duplicate inputs and if there are any, removes the
    duplicates, setting the output to the element-wise max of the
    conflicting outputs

    Args:
        inputs (list<list<float>>): Array of input vectors
        outputs (list<list<float>>): Array of output vectors
    Returns:
        tuple<inputs, outputs>: The modified inputs and outputs
    """
    # Group outputs by their (hashable) input vector, preserving the
    # order in which each distinct input first appeared
    grouped = {}
    for vec, out in zip(inputs, outputs):
        grouped.setdefault(tuple(vec), []).append(out)

    merged_inputs, merged_outputs = [], []
    for vec, outs in grouped.items():
        merged_inputs.append(list(vec))
        # Element-wise maximum across all outputs sharing this input
        merged_outputs.append([max(column) for column in zip(*outs)])
    return merged_inputs, merged_outputs
Implements items method""" @classmethod def values(cls): return [getattr(cls, i) for i in dir(cls) if not i.startswith("__") and i != 'values'] padatious-0.4.8/requirements.txt000077500000000000000000000000241366271224600170100ustar00rootroot00000000000000fann2 xxhash padaos padatious-0.4.8/setup.cfg000066400000000000000000000000321366271224600153410ustar00rootroot00000000000000[bdist_wheel] universal=1 padatious-0.4.8/setup.py000077500000000000000000000027121366271224600152440ustar00rootroot00000000000000#!/usr/bin/env python3 from os.path import join, basename, abspath, dirname from setuptools import setup with open(join(dirname(abspath(__file__)), 'requirements.txt')) as f: requirements = f.readlines() setup( name='padatious', version='0.4.7', # Also change in padatious/__init__.py description='A neural network intent parser', url='http://github.com/MycroftAI/padatious', author='Matthew Scholefield', author_email='matthew331199@gmail.com', license='Apache-2.0', packages=[ 'padatious' ], install_requires=requirements, zip_safe=True, classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'Topic :: Text Processing :: Linguistic', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.0', 'Programming Language :: Python :: 3.1', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ], entry_points={ 'console_scripts': [ 'padatious=padatious.__main__:main' ] }, keywords='intent-parser parser text text-processing', ) 
padatious-0.4.8/tests/000077500000000000000000000000001366271224600146675ustar00rootroot00000000000000padatious-0.4.8/tests/__init__.py000066400000000000000000000000001366271224600167660ustar00rootroot00000000000000padatious-0.4.8/tests/test_all.py000066400000000000000000000114351366271224600170540ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from os.path import isdir from shutil import rmtree from padatious.intent_container import IntentContainer class TestAll: def setup(self): self.cont = IntentContainer('temp') def test_simple(self): self.cont.add_intent('hello', [ 'hello', 'hi', 'how are you', 'whats up' ]) self.cont.add_intent('goodbye', [ 'see you', 'later', 'bye', 'goodbye', 'another time' ]) self.cont.train(False) data = self.cont.calc_intent('whats up') assert data.name == 'hello' assert data.conf > 0.5 def test_single_extraction(self): self.cont.add_intent('drive', [ 'drive to {place}', 'driver over to {place}', 'navigate to {place}' ]) self.cont.add_intent('swim', [ 'swim to {island}', 'swim across {ocean}' ]) self.cont.train(False) data = self.cont.calc_intent('navigate to los angelos') assert data.name == 'drive' assert data.conf > 0.5 assert data.matches == {'place': 'los angelos'} data = self.cont.calc_intent('swim to tahiti') assert data.name == 'swim' assert data.conf > 0.5 assert data.matches == {'island': 'tahiti'} def test_single_extraction_front_back(self): self.cont.add_intent('start.timer', [ 
'Timer {duration}', '{duration} timer', ]) self.cont.train(False) data = self.cont.calc_intent('10 minute timer') assert data.name == 'start.timer' assert data.conf > 0.5 assert data.matches == {'duration': '10 minute'} def test_multi_extraction_easy(self): self.cont.add_intent('search', [ '(search for|find) {query}', '(search for|find) {query} (using|on) {engine}', '(using|on) {engine}, (search for|find) {query}' ]) self.cont.add_intent('order', [ 'order some {food} from {store}', 'place an order for {food} on {store}' ]) self.cont.train(False) data = self.cont.calc_intent('search for funny dog videos') assert data.name == 'search' assert data.matches == {'query': 'funny dog videos'} assert data.conf > 0.5 data = self.cont.calc_intent('search for bananas using foodio') assert data.name == 'search' assert data.matches == {'query': 'bananas', 'engine': 'foodio'} assert data.conf > 0.5 data = self.cont.calc_intent('search for big furry cats using the best search engine') assert data.name == 'search' assert data.matches == {'query': 'big furry cats', 'engine': 'the best search engine'} assert data.conf > 0.5 data = self.cont.calc_intent('place an order for a loaf of bread on foodbuywebsite') assert data.name == 'order' assert data.matches == {'food': 'a loaf of bread', 'store': 'foodbuywebsite'} assert data.conf > 0.5 def test_extraction_dependence(self): self.cont.add_intent('search', [ 'wiki {query}' ]) self.cont.train(False) data = self.cont.calc_intent('wiki') assert data.conf < 0.5 def test_entity_recognition(self): self.cont.add_intent('weather', [ 'weather for {place} {time}' ], True) self.cont.add_intent('time', [ 'what time is it', 'whats the time right now', 'what time is it at the moment', 'currently, what time is it' ], True) self.cont.add_entity('place', [ 'los angeles', 'california', 'new york', 'chicago' ]) self.cont.add_entity('time', [ 'right now', 'currently', 'at the moment', ]) self.cont.train(False) data = self.cont.calc_intent('weather for los 
angeles right now') assert data.name == 'weather' assert data.matches == {'place': 'los angeles', 'time': 'right now'} assert data.conf > 0.5 def teardown(self): if isdir('temp'): rmtree('temp') padatious-0.4.8/tests/test_container.py000066400000000000000000000174101366271224600202650ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from time import monotonic import os import pytest import random from os import mkdir from os.path import isdir, join from shutil import rmtree from padatious.intent_container import IntentContainer class TestIntentContainer: test_lines = ['this is a test\n', 'another test\n'] other_lines = ['something else\n', 'this is a different thing\n'] test_lines_with_entities = ['this is a {test}\n', 'another {test}\n'] other_lines_with_entities = [ 'something {other}\n', 'this is a {other} thing\n'] test_entities = ['test\n', 'assessment\n'] other_entities = ['else\n', 'different\n'] def setup(self): self.cont = IntentContainer('temp') def test_add_intent(self): self.cont.add_intent('test', self.test_lines) self.cont.add_intent('other', self.other_lines) def test_load_intent(self): if not isdir('temp'): mkdir('temp') fn1 = join('temp', 'test.txt') with open(fn1, 'w') as f: f.writelines(self.test_lines) fn2 = join('temp', 'other.txt') with open(fn2, 'w') as f: f.writelines(self.other_lines) self.cont.load_intent('test', fn1) self.cont.load_intent('other', fn1) assert 
len(self.cont.intents.train_data.sent_lists) == 2 def test_train(self): def test(a, b): self.setup() self.test_add_intent() self.cont.train(a, b) test(False, False) test(True, True) def _write_train_data(self): if not isdir('temp'): mkdir('temp') fn1 = join('temp', 'test.intent') with open(fn1, 'w') as f: f.writelines(self.test_lines_with_entities) fn2 = join('temp', 'other.intent') with open(fn2, 'w') as f: f.writelines(self.other_lines_with_entities) fn1 = join('temp', 'test.entity') with open(fn1, 'w') as f: f.writelines(self.test_entities) fn2 = join('temp', 'other.entity') with open(fn2, 'w') as f: f.writelines(self.other_entities) def test_instantiate_from_disk(self): # train and cache (i.e. persist) self.setup() self.test_add_intent() self.cont.add_entity('test', self.test_entities) self.cont.add_entity('other', self.other_entities) self.cont.train() self._write_train_data() # instantiate from disk (load cached files) self.setup() self.cont.instantiate_from_disk() assert len(self.cont.intents.train_data.sent_lists) == 0 assert len(self.cont.intents.objects_to_train) == 0 assert len(self.cont.intents.objects) == 2 result = self.cont.calc_intent('something different') assert result.matches['other'] == 'different' def _create_large_intent(self, depth): if depth == 0: return '(a|b|)' return '{0} {0}'.format(self._create_large_intent(depth - 1)) @pytest.mark.skipif( not os.environ.get('RUN_LONG'), reason="Takes a long time") def test_train_timeout(self): self.cont.add_intent('a', [ ' '.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(5)) for __ in range(300) ]) self.cont.add_intent('b', [ ' '.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(5)) for __ in range(300) ]) a = monotonic() self.cont.train(True, timeout=1) b = monotonic() assert b - a <= 2 a = monotonic() self.cont.train(True, timeout=1) b = monotonic() assert b - a <= 0.1 def test_train_timeout_subprocess(self): self.cont.add_intent('a', [ ' 
'.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(5)) for __ in range(300) ]) self.cont.add_intent('b', [ ' '.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(5)) for __ in range(300) ]) a = monotonic() assert not self.cont.train_subprocess(timeout=0.1) b = monotonic() assert b - a <= 1 def test_train_subprocess(self): self.cont.add_intent('timer', [ 'set a timer for {time} minutes', ]) self.cont.add_entity('time', [ '#', '##', '#:##', '##:##' ]) assert self.cont.train_subprocess(False, timeout=20) intent = self.cont.calc_intent('set timer for 3 minutes') assert intent.name == 'timer' assert intent.matches == {'time': '3'} def test_calc_intents(self): self.test_add_intent() self.cont.train(False) intents = self.cont.calc_intents('this is another test') assert ( intents[0].conf > intents[1].conf) == ( intents[0].name == 'test') assert self.cont.calc_intent('this is another test').name == 'test' def test_empty(self): self.cont.train(False) self.cont.calc_intent('hello') def _test_entities(self, namespace): self.cont.add_intent(namespace + 'intent', [ 'test {ent}' ]) self.cont.add_entity(namespace + 'ent', [ 'one' ]) self.cont.train(False) data = self.cont.calc_intent('test one') high_conf = data.conf assert data.conf > 0.5 assert data['ent'] == 'one' data = self.cont.calc_intent('test two') assert high_conf > data.conf assert 'ent' not in data def test_regular_entities(self): self._test_entities('') def test_namespaced_entities(self): self._test_entities('SkillName:') def test_remove(self): self.test_add_intent() self.cont.train(False) assert self.cont.calc_intent('This is a test').conf == 1.0 self.cont.remove_intent('test') assert self.cont.calc_intent('This is a test').conf < 0.5 self.cont.add_intent('thing', ['A {thing}']) self.cont.add_entity('thing', ['thing']) self.cont.train(False) assert self.cont.calc_intent('A dog').conf < 0.5 assert self.cont.calc_intent('A thing').conf == 1.0 self.cont.remove_entity('thing') assert 
self.cont.calc_intent('A dog').conf == 1.0 def test_overlap(self): self.cont.add_intent('song', ['play {song}']) self.cont.add_intent('news', ['play the news']) self.cont.train(False) assert self.cont.calc_intent('play the news').name == 'news' def test_overlap_backwards(self): self.cont.add_intent('song', ['play {song}']) self.cont.add_intent('news', ['play the news']) self.cont.train(False) assert self.cont.calc_intent('play the news').name == 'news' def test_generalize(self): self.cont.add_intent('timer', [ 'set a timer for {time} minutes', 'make a {time} minute timer' ]) self.cont.add_entity('time', [ '#', '##', '#:##', '##:##' ]) self.cont.train(False) intent = self.cont.calc_intent('make a timer for 3 minute') assert intent.name == 'timer' assert intent.matches == {'time': '3'} def teardown(self): if isdir('temp'): rmtree('temp') padatious-0.4.8/tests/test_entity_edge.py000066400000000000000000000024161366271224600206030ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from padatious.entity_edge import EntityEdge from padatious.train_data import TrainData class TestEntityEdge: def setup(self): self.data = TrainData() self.data.add_lines('', ['a {word} here', 'the {word} here']) self.le = EntityEdge(-1, '{word}', '') self.re = EntityEdge(+1, '{word}', '') def test_match(self): self.le.train(self.data) self.re.train(self.data) sent = ['a', '{word}', 'here'] assert self.le.match(sent, 1) > self.le.match(sent, 0) assert self.le.match(sent, 1) > self.le.match(sent, 2) assert self.re.match(sent, 1) > self.re.match(sent, 0) assert self.re.match(sent, 1) > self.re.match(sent, 2) padatious-0.4.8/tests/test_id_manager.py000066400000000000000000000037431366271224600203750ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os from os.path import isdir from shutil import rmtree from padatious.id_manager import IdManager class TestIdManager: def test_add(self): ids = IdManager() assert 'a' not in ids ids.add_token('a') assert 'a' in ids ids.add_token('a') assert len(ids) == 1 ids.add_sent(['b', 'c']) assert len(ids) == 3 for i in ['b', 'c']: assert i in ids def test_vector(self): ids = IdManager() assert len(ids.vector()) == 0 ids.add_token('a') assert len(ids.vector()) == 1 ids.add_token('b') vec = ids.vector() ids.assign(vec, 'b', 0.5) assert vec == [0, 0.5] def test_assign(self): ids = IdManager(ids={'test': 0, 'word': 1}) vec = ids.vector() ids.assign(vec, 'test', 0.7) ids.assign(vec, 'word', 0.2) assert vec == [0.7, 0.2] def test_save_load(self): ids1 = IdManager() ids1.add_token('hi') ids1.add_token('hello') if not isdir('temp'): os.mkdir('temp') ids1.save('temp/temp') ids2 = IdManager() ids2.load('temp/temp') vec1 = ids1.vector() vec2 = ids2.vector() ids1.assign(vec1, 'hello', 3) ids2.assign(vec2, 'hello', 3) assert vec1 == vec2 def teardown(self): if isdir('temp'): rmtree('temp') padatious-0.4.8/tests/test_intent.py000066400000000000000000000042131366271224600176010ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from os import mkdir from os.path import isdir from shutil import rmtree from padatious.intent import Intent from padatious.train_data import TrainData class TestIntent: def setup(self): self.data = TrainData() self.data.add_lines('hi', ['hello', 'hi', 'hi there']) self.data.add_lines('bye', ['goodbye', 'bye', 'bye {person}', 'see you later']) self.i_hi = Intent('hi') self.i_bye = Intent('bye') self.i_hi.train(self.data) self.i_bye.train(self.data) def test_match(self): assert self.i_hi.match(['hi']).conf > self.i_hi.match(['bye']).conf assert self.i_hi.match(['hi']).conf > self.i_bye.match(['hi']).conf assert self.i_bye.match(['bye']).conf > self.i_bye.match(['hi']).conf assert self.i_bye.match(['bye']).conf > self.i_hi.match(['bye']).conf all = self.i_bye.match(['see', 'you', 'later']).conf assert all > self.i_hi.match(['see']).conf assert all > self.i_hi.match(['you']).conf assert all > self.i_hi.match(['later']).conf matches = self.i_bye.match(['bye', 'john']).matches assert len(matches) == 1 assert '{person}' in matches assert matches['{person}'] == ['john'] def test_save_load(self): if not isdir('temp'): mkdir('temp') self.i_hi.save('temp') self.i_bye.save('temp') self.i_hi = Intent.from_file('hi', 'temp') self.i_bye = Intent.from_file('bye', 'temp') self.test_match() def teardown(self): if isdir('temp'): rmtree('temp') padatious-0.4.8/tests/test_match_data.py000066400000000000000000000026321366271224600203700ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from padatious.match_data import MatchData class TestMatchData: def setup(self): self.match = MatchData('name', ['one', 'two'], {'{word}': ['value', 'tokens']}, 0.5) self.sentence = ["it", "'", "s", "a", "new", "sentence"] self.sentence2 = ["the", "parents", "'", "house"] def test_detokenize(self): self.match.detokenize() assert self.match.sent == 'one two' correct_match = MatchData('name', 'one two', {'word': 'value tokens'}, 0.5) assert self.match.__dict__ == correct_match.__dict__ def test_handle_apostrophes(self): joined_sentence = self.match.handle_apostrophes(self.sentence) joined_sentence2 = self.match.handle_apostrophes(self.sentence2) assert joined_sentence == "it's a new sentence" assert joined_sentence2 == "the parents' house" padatious-0.4.8/tests/test_train_data.py000066400000000000000000000025111366271224600204050ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os from os.path import isfile from padatious.train_data import TrainData class TestTrainData: def setup(self): self.data = TrainData() with open('temp', 'w') as f: f.writelines(['hi']) def test_add_lines(self): self.data.add_file('hi', 'temp') self.data.add_lines('bye', ['bye']) self.data.add_lines('other', ['other']) def cmp(a, b): return set(' '.join(i) for i in a) == set(' '.join(i) for i in b) assert cmp(self.data.my_sents('hi'), [['hi']]) assert cmp(self.data.other_sents('hi'), [['bye'], ['other']]) assert cmp(self.data.all_sents(), [['hi'], ['bye'], ['other']]) def teardown(self): if isfile('temp'): os.remove('temp') padatious-0.4.8/tests/test_util.py000066400000000000000000000037501366271224600172620ustar00rootroot00000000000000# Copyright 2017 Mycroft AI, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from padatious.util import lines_hash, tokenize, resolve_conflicts, StrEnum, expand_parentheses def test_lines_hash(): assert lines_hash(['word1', 'word2']) != lines_hash(['word2', 'word1']) assert lines_hash(['word1', 'word2']) != lines_hash(['word1', 'word1']) def test_tokenize(): assert tokenize('one two three') == ['one', 'two', 'three'] assert tokenize('one1 two2') == ['one', '1', 'two', '2'] assert tokenize('word {ent}') == ['word', '{ent}'] assert tokenize('test:') == ['test', ':'] def test_expand_parentheses(): def xp(s): return {''.join(sent) for sent in expand_parentheses(tokenize(s))} assert xp('1 (2|3) 4 (5|6) 7') == {'12457', '12467', '13457', '13467'} assert xp('1 (2 3) 4') == {'1(23)4'} assert xp('1 (2 3|) 4') == {'1234', '14'} assert xp('1 (|2 3) 4') == {'1234', '14'} assert xp('1 ((2|4) (3|)) 4') == {'1(23)4', '1(2)4', '1(43)4', '1(4)4'} assert xp('1 (2|4|5) 4') == {'124', '144', '154'} assert xp('1 (2|4|5 (6|7)) 4') == {'124', '144', '1564', '1574'} def test_resolve_conflicts(): inputs = [[0, 1], [1, 1], [0, 1]] outputs = [[0.0], [0.5], [0.7]] inputs, outputs = resolve_conflicts(inputs, outputs) assert len(inputs) == 2 assert len(outputs) == 2 assert outputs[inputs.index([0, 1])] == [0.7] def test_str_enum(): class MyEnum(StrEnum): a = '1' b = '2' assert set(MyEnum.values()) == {'1', '2'}